diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,46902 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998399743959033, + "eval_steps": 500, + "global_step": 3124, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 779.4875244140625, + "epoch": 0.0003200512081933109, + "grad_norm": 0.0655626729130745, + "kl": 0.0, + "learning_rate": 6.389776357827476e-08, + "loss": 0.0467, + "reward": 0.2614583417773247, + "reward_std": 0.32735898196697233, + "rewards/accuracy_reward": 0.06875000279396773, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.18854167312383652, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 837.8854309082031, + "epoch": 0.0006401024163866218, + "grad_norm": 0.06388702988624573, + "kl": 0.0, + "learning_rate": 1.2779552715654952e-07, + "loss": 0.0259, + "reward": 0.2718750089406967, + "reward_std": 0.2908510401844978, + "rewards/accuracy_reward": 0.10625000298023224, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.1635416716337204, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 781.164599609375, + "epoch": 0.0009601536245799327, + "grad_norm": 0.06498929113149643, + "kl": 0.0002956547366920859, + "learning_rate": 1.9169329073482428e-07, + "loss": 0.0036, + "reward": 0.22395834028720857, + "reward_std": 0.30153340846300125, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.17395833730697632, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 788.1146057128906, + "epoch": 0.0012802048327732437, + "grad_norm": 0.07566636800765991, + "kl": 0.00030002407875144853, + "learning_rate": 2.5559105431309904e-07, + "loss": 0.0183, + "reward": 0.3145833443850279, + "reward_std": 0.3022632598876953, + "rewards/accuracy_reward": 0.08541666772216558, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.22500000558793545, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 766.7708557128906, + "epoch": 0.0016002560409665546, + "grad_norm": 0.0667392835021019, + "kl": 0.0002937636716524139, + "learning_rate": 3.194888178913738e-07, + "loss": 0.0383, + "reward": 0.2546875052154064, + "reward_std": 0.26029517203569413, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.1796875037252903, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 743.1791931152344, + "epoch": 0.0019203072491598655, + "grad_norm": 0.07169407606124878, + "kl": 0.0003271137073170394, + "learning_rate": 3.8338658146964857e-07, + "loss": 0.0153, + "reward": 0.24218750596046448, + "reward_std": 0.30807158052921296, + "rewards/accuracy_reward": 0.05625000111758709, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.18177083879709244, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 780.4708618164062, + "epoch": 0.0022403584573531766, + "grad_norm": 0.07363027334213257, + "kl": 0.00030389373132493346, + "learning_rate": 4.4728434504792333e-07, + "loss": 0.0244, + "reward": 0.20833334028720857, + "reward_std": 0.308957539498806, + "rewards/accuracy_reward": 0.043750000186264515, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.1625000037252903, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 748.7187683105469, + "epoch": 0.0025604096655464873, + "grad_norm": 0.07643438875675201, + "kl": 0.00032084174163173886, + "learning_rate": 5.111821086261981e-07, + "loss": 0.0317, + "reward": 0.32500000596046447, + "reward_std": 0.326321017742157, + "rewards/accuracy_reward": 0.09583333693444729, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2291666731238365, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 776.7271057128906, + "epoch": 0.0028804608737397984, + "grad_norm": 0.06638701260089874, + "kl": 0.00030416845402214676, + "learning_rate": 5.750798722044729e-07, + "loss": 0.0329, + "reward": 0.26510417461395264, + "reward_std": 0.2857085719704628, + "rewards/accuracy_reward": 0.11041666995733976, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.1526041716337204, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 757.7416931152344, + "epoch": 0.003200512081933109, + "grad_norm": 0.07611778378486633, + "kl": 0.00030973673274274914, + "learning_rate": 6.389776357827476e-07, + "loss": 0.0533, + "reward": 0.29062501043081285, + "reward_std": 0.32584609389305114, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.22187500894069673, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 805.7187683105469, + "epoch": 0.0035205632901264203, + "grad_norm": 0.059567637741565704, + "kl": 0.0002851913624908775, + "learning_rate": 7.028753993610224e-07, + "loss": 0.0066, + "reward": 0.2348958395421505, + "reward_std": 0.28508761525154114, + "rewards/accuracy_reward": 0.0916666692122817, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.1390625037252903, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 789.0291809082031, + "epoch": 0.003840614498319731, + "grad_norm": 0.06451418995857239, + "kl": 0.00028673450433416294, + "learning_rate": 7.667731629392971e-07, + "loss": 0.0169, + "reward": 0.2432291738688946, + "reward_std": 0.2678666725754738, + "rewards/accuracy_reward": 0.07083333451300859, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.17031250447034835, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 784.3416809082031, + "epoch": 0.004160665706513042, + "grad_norm": 0.06225666403770447, + "kl": 0.00030059528799029065, + "learning_rate": 8.306709265175719e-07, + "loss": 0.0149, + "reward": 0.22760416939854622, + "reward_std": 0.27756396904587743, + "rewards/accuracy_reward": 0.06041666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.16718750447034836, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 726.8312622070313, + "epoch": 0.004480716914706353, + "grad_norm": 0.0688437670469284, + "kl": 0.00033944830065593126, + "learning_rate": 8.945686900958467e-07, + "loss": 0.0411, + "reward": 0.2854166775941849, + "reward_std": 0.28965494930744173, + "rewards/accuracy_reward": 0.11250000428408384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1729166716337204, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 812.6875183105469, + "epoch": 0.004800768122899664, + "grad_norm": 0.06382700055837631, + "kl": 0.0003171889838995412, + "learning_rate": 9.584664536741215e-07, + "loss": 0.0315, + "reward": 0.21979167312383652, + "reward_std": 0.251245941221714, + "rewards/accuracy_reward": 0.045833333395421505, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.17187500596046448, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 789.4062683105469, + "epoch": 0.005120819331092975, + "grad_norm": 0.06542661041021347, + "kl": 0.00029948877927381544, + "learning_rate": 1.0223642172523962e-06, + "loss": 0.0325, + "reward": 0.31822917610406876, + "reward_std": 0.29038531333208084, + "rewards/accuracy_reward": 0.12708333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.19114584103226662, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 774.4479431152344, + "epoch": 0.005440870539286286, + "grad_norm": 0.0694902092218399, + "kl": 0.0003498132777167484, + "learning_rate": 1.086261980830671e-06, + "loss": 0.0363, + "reward": 0.21302083879709244, + "reward_std": 0.27850082218647004, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2046875089406967, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 760.539599609375, + "epoch": 0.005760921747479597, + "grad_norm": 0.06334321200847626, + "kl": 0.0003469670336926356, + "learning_rate": 1.1501597444089457e-06, + "loss": 0.0129, + "reward": 0.27968751043081286, + "reward_std": 0.3003611326217651, + "rewards/accuracy_reward": 0.10000000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.17968750596046448, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 796.0041809082031, + "epoch": 0.006080972955672908, + "grad_norm": 0.06805295497179031, + "kl": 0.0004138117627007887, + "learning_rate": 1.2140575079872206e-06, + "loss": 0.0203, + "reward": 0.2723958395421505, + "reward_std": 0.3050135537981987, + "rewards/accuracy_reward": 0.0916666690260172, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1807291738688946, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 828.1062622070312, + "epoch": 0.006401024163866218, + "grad_norm": 0.06387760490179062, + "kl": 0.00039581527817063035, + "learning_rate": 1.2779552715654952e-06, + "loss": 0.0482, + "reward": 0.2500000037252903, + "reward_std": 0.2750077828764915, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.17500000745058059, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 809.339599609375, + "epoch": 0.00672107537205953, + "grad_norm": 0.0663813054561615, + "kl": 0.0004165978491073474, + "learning_rate": 1.34185303514377e-06, + "loss": 0.0227, + "reward": 0.2078125037252903, + "reward_std": 0.28486852943897245, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.15572917088866234, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 784.6770935058594, + "epoch": 0.0070411265802528405, + "grad_norm": 0.06903195381164551, + "kl": 0.00047817713639233264, + "learning_rate": 1.4057507987220447e-06, + "loss": 0.0204, + "reward": 0.3536458477377892, + "reward_std": 0.3153432786464691, + "rewards/accuracy_reward": 0.11041667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2432291775941849, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 764.2896057128906, + "epoch": 0.007361177788446151, + "grad_norm": 0.06699788570404053, + "kl": 0.0005332382366759703, + "learning_rate": 1.4696485623003196e-06, + "loss": 0.0319, + "reward": 0.25468750968575476, + "reward_std": 0.2953441575169563, + "rewards/accuracy_reward": 0.06875000186264515, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.18385417237877846, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 773.4646057128906, + "epoch": 0.007681228996639462, + "grad_norm": 0.07254047691822052, + "kl": 0.0006663853419013321, + "learning_rate": 1.5335463258785943e-06, + "loss": 0.0288, + "reward": 0.2437500074505806, + "reward_std": 0.32061070799827573, + "rewards/accuracy_reward": 0.022916666977107526, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.21666667535901069, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 804.4416870117187, + "epoch": 0.008001280204832774, + "grad_norm": 0.07208665460348129, + "kl": 0.0009051127126440406, + "learning_rate": 1.5974440894568691e-06, + "loss": 0.0228, + "reward": 0.3687500134110451, + "reward_std": 0.3236115902662277, + "rewards/accuracy_reward": 0.11458333767950535, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2541666731238365, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 797.6062744140625, + "epoch": 0.008321331413026083, + "grad_norm": 0.07198836654424667, + "kl": 0.0010994194832164793, + "learning_rate": 1.6613418530351438e-06, + "loss": 0.0327, + "reward": 0.39322917759418485, + "reward_std": 0.33683302253484726, + "rewards/accuracy_reward": 0.15000000447034836, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.2369791693985462, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 741.5521057128906, + "epoch": 0.008641382621219395, + "grad_norm": 0.06907658278942108, + "kl": 0.001576024480164051, + "learning_rate": 1.7252396166134187e-06, + "loss": 0.0406, + "reward": 0.29739583730697633, + "reward_std": 0.32967462539672854, + "rewards/accuracy_reward": 0.0312500013038516, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.2619791716337204, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 727.1312683105468, + "epoch": 0.008961433829412706, + "grad_norm": 0.07349957525730133, + "kl": 0.0019106465857475995, + "learning_rate": 1.7891373801916933e-06, + "loss": 0.0432, + "reward": 0.42604167461395265, + "reward_std": 0.36733110845088957, + "rewards/accuracy_reward": 0.1291666690260172, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.29062501043081285, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 732.5708557128906, + "epoch": 0.009281485037606016, + "grad_norm": 0.07309871166944504, + "kl": 0.0023886744515039028, + "learning_rate": 1.8530351437699682e-06, + "loss": 0.021, + "reward": 0.313541679084301, + "reward_std": 0.31117996871471404, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.2697916775941849, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 779.2083557128906, + "epoch": 0.009601536245799328, + "grad_norm": 0.06522124260663986, + "kl": 0.002394045365508646, + "learning_rate": 1.916932907348243e-06, + "loss": 0.0413, + "reward": 0.39635417610406876, + "reward_std": 0.3403144717216492, + "rewards/accuracy_reward": 0.09375000149011611, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.30260417312383653, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 750.5708557128906, + "epoch": 0.00992158745399264, + "grad_norm": 0.07314804196357727, + "kl": 0.0028994579799473284, + "learning_rate": 1.9808306709265175e-06, + "loss": 0.059, + "reward": 0.37916667461395265, + "reward_std": 0.3659482032060623, + "rewards/accuracy_reward": 0.04583333544433117, + "rewards/format_reward": 0.00833333358168602, + "rewards/tag_count_reward": 0.3250000089406967, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 739.9396179199218, + "epoch": 0.01024163866218595, + "grad_norm": 0.06802285462617874, + "kl": 0.0030266973888501527, + "learning_rate": 2.0447284345047924e-06, + "loss": 0.0349, + "reward": 0.42083333879709245, + "reward_std": 0.33237463533878325, + "rewards/accuracy_reward": 0.04583333358168602, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.36875001043081285, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 776.1646118164062, + "epoch": 0.01056168987037926, + "grad_norm": 0.061457838863134384, + "kl": 0.0036948778200894595, + "learning_rate": 2.1086261980830672e-06, + "loss": 0.0446, + "reward": 0.4390625059604645, + "reward_std": 0.3595381796360016, + "rewards/accuracy_reward": 0.06458333507180214, + "rewards/format_reward": 0.00833333358168602, + "rewards/tag_count_reward": 0.36614584624767305, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 796.0562683105469, + "epoch": 0.010881741078572572, + "grad_norm": 0.06410246342420578, + "kl": 0.007469672057777643, + "learning_rate": 2.172523961661342e-06, + "loss": 0.0565, + "reward": 0.43125001043081285, + "reward_std": 0.3425430357456207, + "rewards/accuracy_reward": 0.05208333544433117, + "rewards/format_reward": 0.010416666977107525, + "rewards/tag_count_reward": 0.3687500059604645, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 743.7812744140625, + "epoch": 0.011201792286765882, + "grad_norm": 0.06519825756549835, + "kl": 0.007782880403101444, + "learning_rate": 2.2364217252396165e-06, + "loss": 0.0653, + "reward": 0.45937501490116117, + "reward_std": 0.36458621323108675, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.016666666977107523, + "rewards/tag_count_reward": 0.37812501192092896, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 734.9021179199219, + "epoch": 0.011521843494959194, + "grad_norm": 0.07106705009937286, + "kl": 0.010493304487317801, + "learning_rate": 2.3003194888178914e-06, + "loss": 0.0507, + "reward": 0.4416666775941849, + "reward_std": 0.361805260181427, + "rewards/accuracy_reward": 0.01458333358168602, + "rewards/format_reward": 0.014583333767950535, + "rewards/tag_count_reward": 0.4125000059604645, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 745.5979370117187, + "epoch": 0.011841894703152504, + "grad_norm": 0.06895628571510315, + "kl": 0.01310005160048604, + "learning_rate": 2.3642172523961663e-06, + "loss": 0.0644, + "reward": 0.5255208402872086, + "reward_std": 0.3466539680957794, + "rewards/accuracy_reward": 0.0687500013038516, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.45260417759418486, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 740.8541809082031, + "epoch": 0.012161945911345815, + "grad_norm": 0.06350603699684143, + "kl": 0.01148996208794415, + "learning_rate": 2.428115015974441e-06, + "loss": 0.0569, + "reward": 0.6489583432674408, + "reward_std": 0.3286067843437195, + "rewards/accuracy_reward": 0.11875000521540642, + "rewards/format_reward": 0.018750000558793545, + "rewards/tag_count_reward": 0.5114583402872086, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 753.3229309082031, + "epoch": 0.012481997119539127, + "grad_norm": 0.06634090095758438, + "kl": 0.010962517792358994, + "learning_rate": 2.4920127795527156e-06, + "loss": 0.0411, + "reward": 0.567187511920929, + "reward_std": 0.32436338663101194, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.02083333395421505, + "rewards/tag_count_reward": 0.5067708522081376, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 740.2521118164062, + "epoch": 0.012802048327732437, + "grad_norm": 0.060020193457603455, + "kl": 0.012796282302588224, + "learning_rate": 2.5559105431309904e-06, + "loss": 0.0726, + "reward": 0.5812500149011612, + "reward_std": 0.3484797939658165, + "rewards/accuracy_reward": 0.050000001676380634, + "rewards/format_reward": 0.014583333767950535, + "rewards/tag_count_reward": 0.5166666805744171, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.7000183105469, + "epoch": 0.013122099535925748, + "grad_norm": 0.05944250524044037, + "kl": 0.012154347030445933, + "learning_rate": 2.6198083067092657e-06, + "loss": 0.1055, + "reward": 0.6166666924953461, + "reward_std": 0.31222147643566134, + "rewards/accuracy_reward": 0.0645833346992731, + "rewards/format_reward": 0.012500000186264515, + "rewards/tag_count_reward": 0.5395833522081375, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 735.7458618164062, + "epoch": 0.01344215074411906, + "grad_norm": 0.06103678047657013, + "kl": 0.014400722924619913, + "learning_rate": 2.68370607028754e-06, + "loss": 0.058, + "reward": 0.6255208432674408, + "reward_std": 0.3416227579116821, + "rewards/accuracy_reward": 0.07083333507180214, + "rewards/format_reward": 0.01250000037252903, + "rewards/tag_count_reward": 0.542187511920929, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 775.9333618164062, + "epoch": 0.01376220195231237, + "grad_norm": 0.06868734210729599, + "kl": 0.017303874902427196, + "learning_rate": 2.747603833865815e-06, + "loss": 0.0529, + "reward": 0.6255208551883698, + "reward_std": 0.32499729096889496, + "rewards/accuracy_reward": 0.10625000204890966, + "rewards/format_reward": 0.01875000037252903, + "rewards/tag_count_reward": 0.5005208492279053, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 698.9979309082031, + "epoch": 0.014082253160505681, + "grad_norm": 0.06461925804615021, + "kl": 0.014360193721950054, + "learning_rate": 2.8115015974440895e-06, + "loss": 0.0818, + "reward": 0.610416692495346, + "reward_std": 0.3274090111255646, + "rewards/accuracy_reward": 0.05208333507180214, + "rewards/format_reward": 0.01666666716337204, + "rewards/tag_count_reward": 0.5416666805744171, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 738.0083618164062, + "epoch": 0.014402304368698993, + "grad_norm": 0.06297613680362701, + "kl": 0.01128577790223062, + "learning_rate": 2.8753993610223648e-06, + "loss": 0.0643, + "reward": 0.5531250178813935, + "reward_std": 0.30965033173561096, + "rewards/accuracy_reward": 0.0229166679084301, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.5239583522081375, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 787.9354431152344, + "epoch": 0.014722355576892302, + "grad_norm": 0.0712624341249466, + "kl": 0.019073341879993676, + "learning_rate": 2.9392971246006392e-06, + "loss": 0.0794, + "reward": 0.5593750149011611, + "reward_std": 0.28996885418891905, + "rewards/accuracy_reward": 0.04375000111758709, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.5093750178813934, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 749.1521057128906, + "epoch": 0.015042406785085614, + "grad_norm": 0.057314738631248474, + "kl": 0.01389997247606516, + "learning_rate": 3.003194888178914e-06, + "loss": 0.0873, + "reward": 0.6125000178813934, + "reward_std": 0.3025829717516899, + "rewards/accuracy_reward": 0.0875000024214387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5250000178813934, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 765.7541809082031, + "epoch": 0.015362457993278924, + "grad_norm": 0.05906614288687706, + "kl": 0.01638176813721657, + "learning_rate": 3.0670926517571885e-06, + "loss": 0.0634, + "reward": 0.6109375119209289, + "reward_std": 0.2791178122162819, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.5588541865348816, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 740.683349609375, + "epoch": 0.015682509201472235, + "grad_norm": 0.06702172756195068, + "kl": 0.017427592631429435, + "learning_rate": 3.130990415335464e-06, + "loss": 0.0844, + "reward": 0.6395833522081376, + "reward_std": 0.28103085309267045, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.5604166835546494, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 724.6375244140625, + "epoch": 0.016002560409665547, + "grad_norm": 0.07204456627368927, + "kl": 0.02422009501606226, + "learning_rate": 3.1948881789137383e-06, + "loss": 0.0571, + "reward": 0.6385416924953461, + "reward_std": 0.3018873170018196, + "rewards/accuracy_reward": 0.09166667088866234, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.5447916835546494, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 745.5000305175781, + "epoch": 0.01632261161785886, + "grad_norm": 0.06197218969464302, + "kl": 0.015600860584527254, + "learning_rate": 3.258785942492013e-06, + "loss": 0.0704, + "reward": 0.6192708551883698, + "reward_std": 0.27808820456266403, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.5776041865348815, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 740.6771057128906, + "epoch": 0.016642662826052167, + "grad_norm": 0.05719318985939026, + "kl": 0.016424881853163244, + "learning_rate": 3.3226837060702876e-06, + "loss": 0.0677, + "reward": 0.6161458492279053, + "reward_std": 0.2974912986159325, + "rewards/accuracy_reward": 0.02083333358168602, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.5932291805744171, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 713.7646118164063, + "epoch": 0.016962714034245478, + "grad_norm": 0.07055334746837616, + "kl": 0.019731516763567925, + "learning_rate": 3.386581469648563e-06, + "loss": 0.1139, + "reward": 0.6614583492279053, + "reward_std": 0.26685925424098966, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.6239583492279053, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 682.839599609375, + "epoch": 0.01728276524243879, + "grad_norm": 0.06527426093816757, + "kl": 0.01968300249427557, + "learning_rate": 3.4504792332268373e-06, + "loss": 0.0842, + "reward": 0.6651041865348816, + "reward_std": 0.27405439764261247, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.6171875119209289, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 706.3312622070313, + "epoch": 0.0176028164506321, + "grad_norm": 0.06510436534881592, + "kl": 0.019583940878510474, + "learning_rate": 3.514376996805112e-06, + "loss": 0.091, + "reward": 0.6786458641290665, + "reward_std": 0.2530492454767227, + "rewards/accuracy_reward": 0.07708333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6015625268220901, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 705.5250244140625, + "epoch": 0.017922867658825413, + "grad_norm": 0.07424038648605347, + "kl": 0.02574802339076996, + "learning_rate": 3.5782747603833866e-06, + "loss": 0.0836, + "reward": 0.7000000178813934, + "reward_std": 0.2880414813756943, + "rewards/accuracy_reward": 0.06875000111758708, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.631250011920929, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 626.9562622070313, + "epoch": 0.018242918867018725, + "grad_norm": 0.07916589826345444, + "kl": 0.022329603042453527, + "learning_rate": 3.642172523961662e-06, + "loss": 0.0914, + "reward": 0.7244791865348816, + "reward_std": 0.24143780022859573, + "rewards/accuracy_reward": 0.04583333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6786458551883697, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 663.2021118164063, + "epoch": 0.018562970075212033, + "grad_norm": 0.0976390466094017, + "kl": 0.025962639041244982, + "learning_rate": 3.7060702875399364e-06, + "loss": 0.1237, + "reward": 0.7281250178813934, + "reward_std": 0.2525856912136078, + "rewards/accuracy_reward": 0.03958333358168602, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.6864583551883697, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 700.6666931152344, + "epoch": 0.018883021283405344, + "grad_norm": 0.06289377063512802, + "kl": 0.0208747168071568, + "learning_rate": 3.7699680511182112e-06, + "loss": 0.0846, + "reward": 0.6458333492279053, + "reward_std": 0.2825623080134392, + "rewards/accuracy_reward": 0.016666666977107523, + "rewards/format_reward": 0.00833333358168602, + "rewards/tag_count_reward": 0.6208333432674408, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 665.733349609375, + "epoch": 0.019203072491598656, + "grad_norm": 0.07116346806287766, + "kl": 0.023737166076898575, + "learning_rate": 3.833865814696486e-06, + "loss": 0.0857, + "reward": 0.7718750238418579, + "reward_std": 0.26197345554828644, + "rewards/accuracy_reward": 0.12083333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6510416865348816, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.3125244140625, + "epoch": 0.019523123699791967, + "grad_norm": 0.061922844499349594, + "kl": 0.023615499306470156, + "learning_rate": 3.8977635782747605e-06, + "loss": 0.0684, + "reward": 0.6494791865348816, + "reward_std": 0.25366342514753343, + "rewards/accuracy_reward": 0.01458333358168602, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.6307291805744171, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 706.2312683105469, + "epoch": 0.01984317490798528, + "grad_norm": 0.06591752171516418, + "kl": 0.019074952974915505, + "learning_rate": 3.961661341853035e-06, + "loss": 0.0935, + "reward": 0.6859375298023224, + "reward_std": 0.2340133711695671, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.6442708551883698, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 691.6312683105468, + "epoch": 0.020163226116178587, + "grad_norm": 0.06416673958301544, + "kl": 0.024632269330322743, + "learning_rate": 4.02555910543131e-06, + "loss": 0.1397, + "reward": 0.7000000238418579, + "reward_std": 0.251662477850914, + "rewards/accuracy_reward": 0.0541666679084301, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.6437500238418579, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 689.0104370117188, + "epoch": 0.0204832773243719, + "grad_norm": 0.06594003736972809, + "kl": 0.023258844204247, + "learning_rate": 4.089456869009585e-06, + "loss": 0.1066, + "reward": 0.7401041805744171, + "reward_std": 0.25073023736476896, + "rewards/accuracy_reward": 0.08333333358168601, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6567708492279053, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 698.4687683105469, + "epoch": 0.02080332853256521, + "grad_norm": 0.06369701772928238, + "kl": 0.02361576007679105, + "learning_rate": 4.15335463258786e-06, + "loss": 0.1014, + "reward": 0.6906250238418579, + "reward_std": 0.23723849654197693, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.6489583551883698, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.3666870117188, + "epoch": 0.02112337974075852, + "grad_norm": 0.0697370246052742, + "kl": 0.027477294206619263, + "learning_rate": 4.2172523961661345e-06, + "loss": 0.0981, + "reward": 0.7203125119209289, + "reward_std": 0.27272156327962876, + "rewards/accuracy_reward": 0.03333333451300859, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.6828125178813934, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 657.4875183105469, + "epoch": 0.021443430948951833, + "grad_norm": 0.07227819412946701, + "kl": 0.022128170542418956, + "learning_rate": 4.28115015974441e-06, + "loss": 0.1108, + "reward": 0.7484375238418579, + "reward_std": 0.2265900582075119, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6713541865348815, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.1854309082031, + "epoch": 0.021763482157145145, + "grad_norm": 0.08304242789745331, + "kl": 0.036028834991157055, + "learning_rate": 4.345047923322684e-06, + "loss": 0.1116, + "reward": 0.8598958671092987, + "reward_std": 0.24308189302682875, + "rewards/accuracy_reward": 0.14791667200624942, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7119791865348816, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.5979431152343, + "epoch": 0.022083533365338453, + "grad_norm": 0.07294842600822449, + "kl": 0.028276703879237176, + "learning_rate": 4.408945686900959e-06, + "loss": 0.0673, + "reward": 0.720312523841858, + "reward_std": 0.2192530706524849, + "rewards/accuracy_reward": 0.012500000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7078125238418579, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 672.1000061035156, + "epoch": 0.022403584573531764, + "grad_norm": 0.07134870439767838, + "kl": 0.025765881687402726, + "learning_rate": 4.472843450479233e-06, + "loss": 0.0862, + "reward": 0.7296875178813934, + "reward_std": 0.24120083600282669, + "rewards/accuracy_reward": 0.02500000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7046875178813934, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 638.2479309082031, + "epoch": 0.022723635781725076, + "grad_norm": 0.07261071354150772, + "kl": 0.036591825634241106, + "learning_rate": 4.536741214057508e-06, + "loss": 0.0825, + "reward": 0.8052083551883698, + "reward_std": 0.22306446582078934, + "rewards/accuracy_reward": 0.06250000149011611, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 643.6604248046875, + "epoch": 0.023043686989918388, + "grad_norm": 0.07969695329666138, + "kl": 0.03463496062904596, + "learning_rate": 4.600638977635783e-06, + "loss": 0.0973, + "reward": 0.8437500298023224, + "reward_std": 0.2542044401168823, + "rewards/accuracy_reward": 0.10000000316649675, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 653.0833557128906, + "epoch": 0.0233637381981117, + "grad_norm": 0.07199858874082565, + "kl": 0.031290368735790254, + "learning_rate": 4.664536741214058e-06, + "loss": 0.0653, + "reward": 0.8010416924953461, + "reward_std": 0.21112514436244964, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7322916865348816, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 671.058349609375, + "epoch": 0.023683789406305007, + "grad_norm": 0.08724746108055115, + "kl": 0.04281867854297161, + "learning_rate": 4.7284345047923325e-06, + "loss": 0.0785, + "reward": 0.7541666924953461, + "reward_std": 0.22042571306228637, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.7145833551883698, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.4229309082032, + "epoch": 0.02400384061449832, + "grad_norm": 0.07457519322633743, + "kl": 0.03650112468749285, + "learning_rate": 4.792332268370608e-06, + "loss": 0.0906, + "reward": 0.8213541805744171, + "reward_std": 0.2625132277607918, + "rewards/accuracy_reward": 0.12083333749324084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7005208551883697, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 652.5500091552734, + "epoch": 0.02432389182269163, + "grad_norm": 0.07498233765363693, + "kl": 0.03543906323611736, + "learning_rate": 4.856230031948882e-06, + "loss": 0.095, + "reward": 0.7598958492279053, + "reward_std": 0.22588661164045334, + "rewards/accuracy_reward": 0.01875, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 677.7000183105469, + "epoch": 0.024643943030884942, + "grad_norm": 0.09663916379213333, + "kl": 0.04480956122279167, + "learning_rate": 4.920127795527157e-06, + "loss": 0.1344, + "reward": 0.7166666865348816, + "reward_std": 0.2699084341526031, + "rewards/accuracy_reward": 0.01875000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.697916692495346, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 658.0666748046875, + "epoch": 0.024963994239078253, + "grad_norm": 0.08884267508983612, + "kl": 0.06787048671394587, + "learning_rate": 4.984025559105431e-06, + "loss": 0.0796, + "reward": 0.7500000238418579, + "reward_std": 0.20937047749757767, + "rewards/accuracy_reward": 0.00625, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 653.7562683105468, + "epoch": 0.025284045447271565, + "grad_norm": 0.12078572064638138, + "kl": 0.04476796705275774, + "learning_rate": 5.0479233226837065e-06, + "loss": 0.1238, + "reward": 0.8010416865348816, + "reward_std": 0.28335138112306596, + "rewards/accuracy_reward": 0.06875000111758708, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.7302083551883698, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 663.2187744140625, + "epoch": 0.025604096655464873, + "grad_norm": 0.9814605712890625, + "kl": 0.16789422370493412, + "learning_rate": 5.111821086261981e-06, + "loss": 0.1109, + "reward": 0.8114583671092988, + "reward_std": 0.26037254482507705, + "rewards/accuracy_reward": 0.07708333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.8104309082031, + "epoch": 0.025924147863658185, + "grad_norm": 0.10365696996450424, + "kl": 0.04504641108214855, + "learning_rate": 5.175718849840255e-06, + "loss": 0.1185, + "reward": 0.7807291984558106, + "reward_std": 0.2670013889670372, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 679.1979431152344, + "epoch": 0.026244199071851496, + "grad_norm": 0.10205037146806717, + "kl": 0.04317870959639549, + "learning_rate": 5.2396166134185315e-06, + "loss": 0.124, + "reward": 0.7625000178813934, + "reward_std": 0.26102137863636016, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.7208333551883698, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 663.708349609375, + "epoch": 0.026564250280044808, + "grad_norm": 0.09976530820131302, + "kl": 0.04077131990343332, + "learning_rate": 5.303514376996806e-06, + "loss": 0.1011, + "reward": 0.8312500178813934, + "reward_std": 0.2625503420829773, + "rewards/accuracy_reward": 0.08333333544433116, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7479166865348816, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 633.995849609375, + "epoch": 0.02688430148823812, + "grad_norm": 0.11126335710287094, + "kl": 0.04656725451350212, + "learning_rate": 5.36741214057508e-06, + "loss": 0.1043, + "reward": 0.7817708551883698, + "reward_std": 0.2660654917359352, + "rewards/accuracy_reward": 0.06041666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7213541865348816, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.8979370117188, + "epoch": 0.027204352696431428, + "grad_norm": 0.1051332950592041, + "kl": 0.04987532235682011, + "learning_rate": 5.431309904153355e-06, + "loss": 0.127, + "reward": 0.7156250238418579, + "reward_std": 0.253768752515316, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.7093750238418579, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.9750183105468, + "epoch": 0.02752440390462474, + "grad_norm": 0.1315613090991974, + "kl": 0.049108054488897324, + "learning_rate": 5.49520766773163e-06, + "loss": 0.1478, + "reward": 0.6697916865348816, + "reward_std": 0.3035299152135849, + "rewards/accuracy_reward": 0.014583333767950535, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6552083551883697, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 686.2729309082031, + "epoch": 0.02784445511281805, + "grad_norm": 0.15391191840171814, + "kl": 0.055657780915498736, + "learning_rate": 5.5591054313099045e-06, + "loss": 0.1489, + "reward": 0.6500000208616257, + "reward_std": 0.29610070735216143, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6166666835546494, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.4562683105469, + "epoch": 0.028164506321011362, + "grad_norm": 0.11603273451328278, + "kl": 0.04724425338208675, + "learning_rate": 5.623003194888179e-06, + "loss": 0.1022, + "reward": 0.6697916924953461, + "reward_std": 0.2515522539615631, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6489583492279053, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 696.1625183105468, + "epoch": 0.028484557529204674, + "grad_norm": 0.15315236151218414, + "kl": 0.04981156475841999, + "learning_rate": 5.6869009584664534e-06, + "loss": 0.0933, + "reward": 0.665104192495346, + "reward_std": 0.27600702494382856, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6192708611488342, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 695.0104431152344, + "epoch": 0.028804608737397985, + "grad_norm": 0.14363472163677216, + "kl": 0.048564912378787996, + "learning_rate": 5.7507987220447296e-06, + "loss": 0.0629, + "reward": 0.6744791805744171, + "reward_std": 0.2537939205765724, + "rewards/accuracy_reward": 0.039583333395421506, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6348958432674408, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 675.5833557128906, + "epoch": 0.029124659945591293, + "grad_norm": 0.24816231429576874, + "kl": 0.07004429288208484, + "learning_rate": 5.814696485623004e-06, + "loss": 0.0738, + "reward": 0.6838541865348816, + "reward_std": 0.2542484775185585, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6401041924953461, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 707.4437683105468, + "epoch": 0.029444711153784605, + "grad_norm": 0.35220104455947876, + "kl": 0.08218934014439583, + "learning_rate": 5.8785942492012785e-06, + "loss": 0.0353, + "reward": 0.6536458551883697, + "reward_std": 0.2631165474653244, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.6140625178813934, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.3979370117188, + "epoch": 0.029764762361977917, + "grad_norm": 0.48827409744262695, + "kl": 0.1260451439768076, + "learning_rate": 5.942492012779553e-06, + "loss": 0.0827, + "reward": 0.658854192495346, + "reward_std": 0.26948108375072477, + "rewards/accuracy_reward": 0.022916667349636554, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6359375238418579, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 688.8896057128907, + "epoch": 0.030084813570171228, + "grad_norm": 0.6792539358139038, + "kl": 0.1645615816116333, + "learning_rate": 6.006389776357828e-06, + "loss": 0.0582, + "reward": 0.7057291865348816, + "reward_std": 0.27467391192913054, + "rewards/accuracy_reward": 0.08125000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6244791924953461, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 699.545849609375, + "epoch": 0.03040486477836454, + "grad_norm": 0.7102315425872803, + "kl": 0.24729929864406586, + "learning_rate": 6.070287539936103e-06, + "loss": -0.0393, + "reward": 0.6776041984558105, + "reward_std": 0.2612756446003914, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6046875298023224, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.2875183105468, + "epoch": 0.030724915986557848, + "grad_norm": 1.246417760848999, + "kl": 0.40844622552394866, + "learning_rate": 6.134185303514377e-06, + "loss": -0.068, + "reward": 0.657812523841858, + "reward_std": 0.2727681741118431, + "rewards/accuracy_reward": 0.04791666772216559, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.6078125238418579, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.0271118164062, + "epoch": 0.03104496719475116, + "grad_norm": 3.4409334659576416, + "kl": 1.8638823270797729, + "learning_rate": 6.1980830670926515e-06, + "loss": -0.1356, + "reward": 0.6661458492279053, + "reward_std": 0.28613357841968534, + "rewards/accuracy_reward": 0.07500000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5911458492279053, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 651.3687622070313, + "epoch": 0.03136501840294447, + "grad_norm": 6.783247470855713, + "kl": 0.931004011631012, + "learning_rate": 6.261980830670928e-06, + "loss": -0.19, + "reward": 0.6791666865348815, + "reward_std": 0.30113149881362916, + "rewards/accuracy_reward": 0.0937500026077032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5854166805744171, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.1354309082031, + "epoch": 0.03168506961113778, + "grad_norm": 0.5530074238777161, + "kl": 0.5937871515750885, + "learning_rate": 6.325878594249202e-06, + "loss": -0.2302, + "reward": 0.5416666805744171, + "reward_std": 0.3111159473657608, + "rewards/accuracy_reward": 0.016666667349636555, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5250000119209289, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.7812622070312, + "epoch": 0.032005120819331094, + "grad_norm": 0.4718743860721588, + "kl": 0.8061838716268539, + "learning_rate": 6.3897763578274765e-06, + "loss": -0.1769, + "reward": 0.5369791775941849, + "reward_std": 0.281660270690918, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.49322917461395266, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.3187622070312, + "epoch": 0.032325172027524406, + "grad_norm": 1.6203663349151611, + "kl": 1.0429674439132213, + "learning_rate": 6.453674121405751e-06, + "loss": -0.2055, + "reward": 0.5328125149011612, + "reward_std": 0.3219150841236115, + "rewards/accuracy_reward": 0.09375000242143869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4390625149011612, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.24376525878904, + "epoch": 0.03264522323571772, + "grad_norm": 15.725761413574219, + "kl": 1.3648170441389085, + "learning_rate": 6.517571884984026e-06, + "loss": -0.264, + "reward": 0.387500011920929, + "reward_std": 0.2903954029083252, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3458333447575569, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.0270965576172, + "epoch": 0.03296527444391103, + "grad_norm": 23.40546989440918, + "kl": 7.020788234472275, + "learning_rate": 6.581469648562301e-06, + "loss": -0.1625, + "reward": 0.2968750104308128, + "reward_std": 0.26096881479024886, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2927083417773247, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.03334350585936, + "epoch": 0.03328532565210433, + "grad_norm": 7.267569541931152, + "kl": 2.4132164478302003, + "learning_rate": 6.645367412140575e-06, + "loss": -0.5439, + "reward": 0.24791667461395264, + "reward_std": 0.21916062086820604, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.24791667461395264, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.7791778564453, + "epoch": 0.033605376860297645, + "grad_norm": 2.026956558227539, + "kl": 5.643379735946655, + "learning_rate": 6.709265175718851e-06, + "loss": -0.4394, + "reward": 0.31406250447034834, + "reward_std": 0.22878252267837523, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.24739584028720857, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.2979278564453, + "epoch": 0.033925428068490956, + "grad_norm": 3.8936071395874023, + "kl": 2.7883768916130065, + "learning_rate": 6.773162939297126e-06, + "loss": -0.7208, + "reward": 0.36197917312383654, + "reward_std": 0.2445184901356697, + "rewards/accuracy_reward": 0.10416666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2578125074505806, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.5145965576172, + "epoch": 0.03424547927668427, + "grad_norm": 1.3432917594909668, + "kl": 2.7319489240646364, + "learning_rate": 6.8370607028754e-06, + "loss": -0.5744, + "reward": 0.3395833432674408, + "reward_std": 0.23924150168895722, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.30625001192092893, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.5729248046875, + "epoch": 0.03456553048487758, + "grad_norm": 17.39250373840332, + "kl": 10.756308102607727, + "learning_rate": 6.900958466453675e-06, + "loss": -0.7653, + "reward": 0.2385416731238365, + "reward_std": 0.23416123688220977, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2385416731238365, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.77084197998045, + "epoch": 0.03488558169307089, + "grad_norm": 11.659235954284668, + "kl": 7.152430748939514, + "learning_rate": 6.96485623003195e-06, + "loss": -0.5854, + "reward": 0.3473958447575569, + "reward_std": 0.24123955219984056, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.28072917461395264, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.0770950317383, + "epoch": 0.0352056329012642, + "grad_norm": 1.1932032108306885, + "kl": 2.870084857940674, + "learning_rate": 7.028753993610224e-06, + "loss": -0.825, + "reward": 0.40833334177732467, + "reward_std": 0.23637133538722993, + "rewards/accuracy_reward": 0.13333333730697633, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2750000074505806, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.9896011352539, + "epoch": 0.035525684109457514, + "grad_norm": 0.5973522067070007, + "kl": 2.3504113078117372, + "learning_rate": 7.092651757188499e-06, + "loss": -0.7701, + "reward": 0.3484375089406967, + "reward_std": 0.2674763187766075, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.31093751043081286, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.56459350585936, + "epoch": 0.035845735317650826, + "grad_norm": 0.35262539982795715, + "kl": 2.6340174436569215, + "learning_rate": 7.156549520766773e-06, + "loss": -0.5525, + "reward": 0.3354166761040688, + "reward_std": 0.2599399000406265, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.29791667610406875, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.2562591552734, + "epoch": 0.03616578652584414, + "grad_norm": 0.3773941397666931, + "kl": 2.847642481327057, + "learning_rate": 7.220447284345049e-06, + "loss": -0.6454, + "reward": 0.3380208432674408, + "reward_std": 0.2487858936190605, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3338541775941849, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.59376525878906, + "epoch": 0.03648583773403745, + "grad_norm": 1.5241540670394897, + "kl": 3.3227088809013368, + "learning_rate": 7.284345047923324e-06, + "loss": -0.4856, + "reward": 0.34270834028720853, + "reward_std": 0.23925637304782868, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.34270834028720853, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 485.65834655761716, + "epoch": 0.036805888942230754, + "grad_norm": 0.6140667200088501, + "kl": 2.535499429702759, + "learning_rate": 7.348242811501598e-06, + "loss": -0.5934, + "reward": 0.3692708432674408, + "reward_std": 0.23712805062532424, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3338541775941849, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.3479309082031, + "epoch": 0.037125940150424065, + "grad_norm": 0.2592978775501251, + "kl": 1.8334528475999832, + "learning_rate": 7.412140575079873e-06, + "loss": -0.4457, + "reward": 0.41614584922790526, + "reward_std": 0.23759952187538147, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.37864584624767306, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.435433959961, + "epoch": 0.03744599135861738, + "grad_norm": 0.2672344148159027, + "kl": 2.5485853970050814, + "learning_rate": 7.476038338658148e-06, + "loss": -0.4729, + "reward": 0.4140625178813934, + "reward_std": 0.23257143348455428, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.33906251192092896, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.4583435058594, + "epoch": 0.03776604256681069, + "grad_norm": 0.169293612241745, + "kl": 2.141266053915024, + "learning_rate": 7.5399361022364225e-06, + "loss": -0.4337, + "reward": 0.4895833536982536, + "reward_std": 0.20847297906875611, + "rewards/accuracy_reward": 0.13333333730697633, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3562500074505806, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.439599609375, + "epoch": 0.038086093775004, + "grad_norm": 0.16004759073257446, + "kl": 2.440905587375164, + "learning_rate": 7.603833865814697e-06, + "loss": -0.3232, + "reward": 0.49895834624767305, + "reward_std": 0.2105870932340622, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3947916805744171, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.9896118164063, + "epoch": 0.03840614498319731, + "grad_norm": 0.121612548828125, + "kl": 2.967607820034027, + "learning_rate": 7.667731629392972e-06, + "loss": -0.4322, + "reward": 0.38072917610406876, + "reward_std": 0.20375476628541947, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3473958417773247, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.1312728881836, + "epoch": 0.03872619619139062, + "grad_norm": 0.45568010210990906, + "kl": 3.2019447505474092, + "learning_rate": 7.731629392971247e-06, + "loss": -0.4594, + "reward": 0.43593751788139345, + "reward_std": 0.21259481608867645, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3630208447575569, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.1395965576172, + "epoch": 0.039046247399583935, + "grad_norm": 0.22284314036369324, + "kl": 2.2424231648445128, + "learning_rate": 7.795527156549521e-06, + "loss": -0.3353, + "reward": 0.41145834028720857, + "reward_std": 0.20110711306333542, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.37604167461395266, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.7979370117188, + "epoch": 0.039366298607777246, + "grad_norm": 0.12313688546419144, + "kl": 2.1035622477531435, + "learning_rate": 7.859424920127796e-06, + "loss": -0.3469, + "reward": 0.40520834624767305, + "reward_std": 0.21936969012022017, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.38437501192092893, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.0708465576172, + "epoch": 0.03968634981597056, + "grad_norm": 1.0849019289016724, + "kl": 3.4187208458781244, + "learning_rate": 7.92332268370607e-06, + "loss": -0.377, + "reward": 0.426562511920929, + "reward_std": 0.2063383214175701, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3598958432674408, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.7833465576172, + "epoch": 0.04000640102416387, + "grad_norm": 0.17697374522686005, + "kl": 3.645887120813131, + "learning_rate": 7.987220447284347e-06, + "loss": -0.4098, + "reward": 0.4916666835546494, + "reward_std": 0.1886795900762081, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3916666775941849, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.3270965576172, + "epoch": 0.040326452232357174, + "grad_norm": 0.9790335297584534, + "kl": 4.54524188041687, + "learning_rate": 8.05111821086262e-06, + "loss": -0.5171, + "reward": 0.3625000089406967, + "reward_std": 0.18890787661075592, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3270833432674408, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.5625122070312, + "epoch": 0.040646503440550485, + "grad_norm": 0.2020310014486313, + "kl": 3.900516414642334, + "learning_rate": 8.115015974440896e-06, + "loss": -0.6225, + "reward": 0.31822917312383653, + "reward_std": 0.20110346525907516, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3161458417773247, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.4583557128906, + "epoch": 0.0409665546487438, + "grad_norm": 0.6082282662391663, + "kl": 3.7118215203285216, + "learning_rate": 8.17891373801917e-06, + "loss": -0.4666, + "reward": 0.3500000089406967, + "reward_std": 0.20825719237327575, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.34791667461395265, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.7604309082031, + "epoch": 0.04128660585693711, + "grad_norm": 0.1156620979309082, + "kl": 2.5296462953090666, + "learning_rate": 8.242811501597445e-06, + "loss": -0.4838, + "reward": 0.3765625089406967, + "reward_std": 0.22583024352788925, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.33906251192092896, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.91876525878905, + "epoch": 0.04160665706513042, + "grad_norm": 0.1654772162437439, + "kl": 1.6273529171943664, + "learning_rate": 8.30670926517572e-06, + "loss": -0.5471, + "reward": 0.46093751192092897, + "reward_std": 0.2393754631280899, + "rewards/accuracy_reward": 0.13958333749324084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.32135417461395266, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.1875122070312, + "epoch": 0.04192670827332373, + "grad_norm": 0.24710217118263245, + "kl": 1.1954802095890045, + "learning_rate": 8.370607028753994e-06, + "loss": -0.4337, + "reward": 0.35468751192092896, + "reward_std": 0.21932000368833543, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3526041805744171, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 494.2604278564453, + "epoch": 0.04224675948151704, + "grad_norm": 0.3936014175415039, + "kl": 1.0165327221155167, + "learning_rate": 8.434504792332269e-06, + "loss": -0.4978, + "reward": 0.3369791775941849, + "reward_std": 0.21726072281599046, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3369791775941849, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.7979278564453, + "epoch": 0.042566810689710355, + "grad_norm": 0.29438725113868713, + "kl": 1.0482024848461151, + "learning_rate": 8.498402555910544e-06, + "loss": -0.4462, + "reward": 0.4250000149011612, + "reward_std": 0.21294644474983215, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.35416668057441714, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.9062683105469, + "epoch": 0.042886861897903666, + "grad_norm": 0.17954134941101074, + "kl": 1.039529764652252, + "learning_rate": 8.56230031948882e-06, + "loss": -0.3937, + "reward": 0.3729166775941849, + "reward_std": 0.20056458413600922, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3729166775941849, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.2791809082031, + "epoch": 0.04320691310609698, + "grad_norm": 0.12642943859100342, + "kl": 1.100398463010788, + "learning_rate": 8.626198083067093e-06, + "loss": -0.4328, + "reward": 0.38802084922790525, + "reward_std": 0.19484265446662902, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3546875149011612, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.1250091552735, + "epoch": 0.04352696431429029, + "grad_norm": 0.10088885575532913, + "kl": 1.264848804473877, + "learning_rate": 8.690095846645368e-06, + "loss": -0.3422, + "reward": 0.414583346247673, + "reward_std": 0.18097187280654908, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3791666775941849, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.5021057128906, + "epoch": 0.043847015522483594, + "grad_norm": 0.14205513894557953, + "kl": 1.0939792722463608, + "learning_rate": 8.753993610223644e-06, + "loss": -0.2693, + "reward": 0.42968751192092897, + "reward_std": 0.16178201138973236, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.39427084624767306, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.9916748046875, + "epoch": 0.044167066730676906, + "grad_norm": 0.0982375368475914, + "kl": 0.7304096844047308, + "learning_rate": 8.817891373801917e-06, + "loss": -0.1667, + "reward": 0.40989584624767306, + "reward_std": 0.1424245983362198, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.40989584624767306, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.9458435058593, + "epoch": 0.04448711793887022, + "grad_norm": 0.13668540120124817, + "kl": 1.4703953325748444, + "learning_rate": 8.881789137380193e-06, + "loss": -0.1482, + "reward": 0.4848958492279053, + "reward_std": 0.1417014442384243, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.41614584624767303, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 696.9750122070312, + "epoch": 0.04480716914706353, + "grad_norm": 1.111905574798584, + "kl": 0.7676813244819641, + "learning_rate": 8.945686900958466e-06, + "loss": -0.0908, + "reward": 0.4343750149011612, + "reward_std": 0.1505623020231724, + "rewards/accuracy_reward": 0.01041666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.42395834624767303, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.5812744140625, + "epoch": 0.04512722035525684, + "grad_norm": 0.17764167487621307, + "kl": 0.6737473249435425, + "learning_rate": 9.009584664536743e-06, + "loss": -0.1169, + "reward": 0.42968750894069674, + "reward_std": 0.1200267419219017, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.42968750894069674, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.0166809082032, + "epoch": 0.04544727156345015, + "grad_norm": 0.0950830951333046, + "kl": 1.0807377099990845, + "learning_rate": 9.073482428115017e-06, + "loss": -0.1856, + "reward": 0.4817708432674408, + "reward_std": 0.15390508249402046, + "rewards/accuracy_reward": 0.07083333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.41093750596046447, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.5645965576172, + "epoch": 0.045767322771643464, + "grad_norm": 0.1526706963777542, + "kl": 0.7299367796629668, + "learning_rate": 9.137380191693292e-06, + "loss": -0.1688, + "reward": 0.5208333522081375, + "reward_std": 0.14540843814611434, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.418750011920929, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.7479370117187, + "epoch": 0.046087373979836775, + "grad_norm": 0.15692493319511414, + "kl": 0.7879636850208044, + "learning_rate": 9.201277955271566e-06, + "loss": -0.1618, + "reward": 0.4661458432674408, + "reward_std": 0.12382942289113999, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.43281251192092896, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 651.795849609375, + "epoch": 0.04640742518803009, + "grad_norm": 0.07808000594377518, + "kl": 0.7620222073048353, + "learning_rate": 9.265175718849841e-06, + "loss": -0.1348, + "reward": 0.5567708402872086, + "reward_std": 0.11367295645177364, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.45260417759418486, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.8125122070312, + "epoch": 0.0467274763962234, + "grad_norm": 0.07719507813453674, + "kl": 0.6032818179577589, + "learning_rate": 9.329073482428116e-06, + "loss": -0.0882, + "reward": 0.5203125149011611, + "reward_std": 0.10005458071827888, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.45364584028720856, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.1833465576171, + "epoch": 0.04704752760441671, + "grad_norm": 0.06736160814762115, + "kl": 0.4908189844340086, + "learning_rate": 9.39297124600639e-06, + "loss": -0.0721, + "reward": 0.5036458462476731, + "reward_std": 0.1022965095937252, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4661458432674408, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 696.7562683105468, + "epoch": 0.047367578812610014, + "grad_norm": 0.06782618910074234, + "kl": 0.44623993411660196, + "learning_rate": 9.456869009584665e-06, + "loss": -0.0432, + "reward": 0.4822916746139526, + "reward_std": 0.09638992436230183, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4802083432674408, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.6437683105469, + "epoch": 0.047687630020803326, + "grad_norm": 0.06565655022859573, + "kl": 1.2311117429286242, + "learning_rate": 9.52076677316294e-06, + "loss": -0.0837, + "reward": 0.6432291865348816, + "reward_std": 0.10860510841012001, + "rewards/accuracy_reward": 0.1708333384245634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.47239584624767306, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 673.8854370117188, + "epoch": 0.04800768122899664, + "grad_norm": 0.05761784315109253, + "kl": 0.3750141691416502, + "learning_rate": 9.584664536741216e-06, + "loss": -0.0318, + "reward": 0.5354166865348816, + "reward_std": 0.12210818231105805, + "rewards/accuracy_reward": 0.05000000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4854166805744171, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 683.5250122070313, + "epoch": 0.04832773243718995, + "grad_norm": 0.06147807464003563, + "kl": 0.48957694321870804, + "learning_rate": 9.64856230031949e-06, + "loss": -0.0461, + "reward": 0.518750011920929, + "reward_std": 0.09728633239865303, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4833333432674408, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 687.3541809082031, + "epoch": 0.04864778364538326, + "grad_norm": 0.05679427832365036, + "kl": 0.7635953679680825, + "learning_rate": 9.712460063897765e-06, + "loss": -0.0577, + "reward": 0.5260416865348816, + "reward_std": 0.09843504205346107, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4927083492279053, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 709.8979431152344, + "epoch": 0.04896783485357657, + "grad_norm": 0.061386361718177795, + "kl": 0.34264843370765447, + "learning_rate": 9.77635782747604e-06, + "loss": -0.0276, + "reward": 0.5447916924953461, + "reward_std": 0.12330903708934784, + "rewards/accuracy_reward": 0.047916668094694616, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.49687502086162566, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 653.8687683105469, + "epoch": 0.049287886061769884, + "grad_norm": 0.06698963791131973, + "kl": 0.8141861855983734, + "learning_rate": 9.840255591054313e-06, + "loss": -0.06, + "reward": 0.5182291865348816, + "reward_std": 0.12519886679947376, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5140625238418579, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.10419921875, + "epoch": 0.049607937269963195, + "grad_norm": 0.0670495480298996, + "kl": 1.0433136209845544, + "learning_rate": 9.904153354632589e-06, + "loss": -0.0964, + "reward": 0.5984375208616257, + "reward_std": 0.15226368308067323, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.525520846247673, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 684.5625244140625, + "epoch": 0.04992798847815651, + "grad_norm": 0.06608369201421738, + "kl": 0.8835949804633856, + "learning_rate": 9.968051118210862e-06, + "loss": -0.0806, + "reward": 0.6583333492279053, + "reward_std": 0.17013006806373596, + "rewards/accuracy_reward": 0.10833333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5500000238418579, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 745.2604431152344, + "epoch": 0.05024803968634982, + "grad_norm": 0.06933408230543137, + "kl": 0.14976065829396248, + "learning_rate": 1.0031948881789138e-05, + "loss": -0.0559, + "reward": 0.5828125238418579, + "reward_std": 0.1708540216088295, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.580729192495346, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 728.7020935058594, + "epoch": 0.05056809089454313, + "grad_norm": 0.06534843891859055, + "kl": 0.4937169037759304, + "learning_rate": 1.0095846645367413e-05, + "loss": -0.0882, + "reward": 0.6229166924953461, + "reward_std": 0.17530024647712708, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6229166924953461, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 734.3208557128906, + "epoch": 0.050888142102736435, + "grad_norm": 0.062214288860559464, + "kl": 0.5076489731669426, + "learning_rate": 1.0159744408945688e-05, + "loss": -0.0606, + "reward": 0.7739583492279053, + "reward_std": 0.2034250505268574, + "rewards/accuracy_reward": 0.11875000447034836, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6552083551883697, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 692.9541870117188, + "epoch": 0.051208193310929746, + "grad_norm": 0.0641920194029808, + "kl": 0.3405150633305311, + "learning_rate": 1.0223642172523962e-05, + "loss": -0.0657, + "reward": 0.7109375178813935, + "reward_std": 0.16137611567974092, + "rewards/accuracy_reward": 0.014583333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6963541865348816, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 705.8896057128907, + "epoch": 0.05152824451912306, + "grad_norm": 0.057186439633369446, + "kl": 0.3750379033386707, + "learning_rate": 1.0287539936102237e-05, + "loss": -0.0887, + "reward": 0.7437500178813934, + "reward_std": 0.17544491738080978, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7000000298023223, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 787.7479309082031, + "epoch": 0.05184829572731637, + "grad_norm": 0.053737491369247437, + "kl": 0.26711506862193346, + "learning_rate": 1.035143769968051e-05, + "loss": -0.044, + "reward": 0.7645833492279053, + "reward_std": 0.17004137337207795, + "rewards/accuracy_reward": 0.03125000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7333333551883697, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 795.1354370117188, + "epoch": 0.05216834693550968, + "grad_norm": 0.04419364780187607, + "kl": 0.17381047271192074, + "learning_rate": 1.0415335463258786e-05, + "loss": -0.0529, + "reward": 0.7645833551883697, + "reward_std": 0.14466918781399726, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7270833551883698, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 750.2250183105468, + "epoch": 0.05248839814370299, + "grad_norm": 0.04193587601184845, + "kl": 0.20865581147372722, + "learning_rate": 1.0479233226837063e-05, + "loss": -0.0779, + "reward": 0.7322916805744171, + "reward_std": 0.15352466367185116, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7239583551883697, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 752.6562683105469, + "epoch": 0.052808449351896304, + "grad_norm": 0.04201805219054222, + "kl": 0.08975614961236715, + "learning_rate": 1.0543130990415335e-05, + "loss": -0.0339, + "reward": 0.9260416865348816, + "reward_std": 0.12468962892889976, + "rewards/accuracy_reward": 0.1937500059604645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7322916805744171, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 702.752099609375, + "epoch": 0.053128500560089616, + "grad_norm": 0.04461502656340599, + "kl": 0.30946322418749334, + "learning_rate": 1.0607028753993612e-05, + "loss": -0.0498, + "reward": 0.8140625178813934, + "reward_std": 0.12668757885694504, + "rewards/accuracy_reward": 0.07916666995733976, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 741.483349609375, + "epoch": 0.05344855176828293, + "grad_norm": 0.04805014282464981, + "kl": 0.2729524029418826, + "learning_rate": 1.0670926517571887e-05, + "loss": -0.0861, + "reward": 0.7713541984558105, + "reward_std": 0.1388819508254528, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7317708551883697, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 750.4937622070313, + "epoch": 0.05376860297647624, + "grad_norm": 0.04891999065876007, + "kl": 0.23906942158937455, + "learning_rate": 1.073482428115016e-05, + "loss": -0.0397, + "reward": 0.8406250178813934, + "reward_std": 0.1587209053337574, + "rewards/accuracy_reward": 0.11666666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7239583432674408, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 747.183349609375, + "epoch": 0.05408865418466955, + "grad_norm": 0.061414770781993866, + "kl": 0.12371877171099185, + "learning_rate": 1.0798722044728436e-05, + "loss": -0.0639, + "reward": 0.7963541865348815, + "reward_std": 0.13769610971212387, + "rewards/accuracy_reward": 0.05208333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7442708492279053, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 763.4146057128906, + "epoch": 0.054408705392862855, + "grad_norm": 0.04897288233041763, + "kl": 0.14503006264567375, + "learning_rate": 1.086261980830671e-05, + "loss": -0.072, + "reward": 0.8734375298023224, + "reward_std": 0.12419936545193196, + "rewards/accuracy_reward": 0.12500000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7484375178813935, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 773.9437805175781, + "epoch": 0.05472875660105617, + "grad_norm": 0.04901808127760887, + "kl": 0.27871253080666064, + "learning_rate": 1.0926517571884985e-05, + "loss": -0.0725, + "reward": 0.8041666865348815, + "reward_std": 0.21438361182808877, + "rewards/accuracy_reward": 0.08125000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7229166865348816, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 752.4146057128906, + "epoch": 0.05504880780924948, + "grad_norm": 0.048184193670749664, + "kl": 0.14338683970272542, + "learning_rate": 1.099041533546326e-05, + "loss": -0.0312, + "reward": 0.7796875238418579, + "reward_std": 0.1453725527971983, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 808.0916931152344, + "epoch": 0.05536885901744279, + "grad_norm": 0.05158894136548042, + "kl": 0.14129090048372744, + "learning_rate": 1.1054313099041534e-05, + "loss": -0.0349, + "reward": 0.8651041924953461, + "reward_std": 0.20059835761785508, + "rewards/accuracy_reward": 0.11875000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7463541805744172, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 743.1812744140625, + "epoch": 0.0556889102256361, + "grad_norm": 0.052500639110803604, + "kl": 0.19789513647556306, + "learning_rate": 1.1118210862619809e-05, + "loss": -0.0694, + "reward": 0.8750000298023224, + "reward_std": 0.14784687310457229, + "rewards/accuracy_reward": 0.1458333373069763, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.729166692495346, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 788.4729309082031, + "epoch": 0.05600896143382941, + "grad_norm": 0.045249536633491516, + "kl": 0.14523118771612645, + "learning_rate": 1.1182108626198084e-05, + "loss": -0.0787, + "reward": 0.8843750357627869, + "reward_std": 0.21863970458507537, + "rewards/accuracy_reward": 0.15625000409781933, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.728125023841858, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 791.3062683105469, + "epoch": 0.056329012642022724, + "grad_norm": 0.04602384939789772, + "kl": 0.11468939054757357, + "learning_rate": 1.1246006389776358e-05, + "loss": -0.0463, + "reward": 0.8541666984558105, + "reward_std": 0.15563478991389273, + "rewards/accuracy_reward": 0.1229166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7312500178813934, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 739.3500244140625, + "epoch": 0.056649063850216036, + "grad_norm": 0.04179657995700836, + "kl": 0.07534434907138347, + "learning_rate": 1.1309904153354633e-05, + "loss": -0.044, + "reward": 0.783854192495346, + "reward_std": 0.12365256864577531, + "rewards/accuracy_reward": 0.04375000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 802.0562683105469, + "epoch": 0.05696911505840935, + "grad_norm": 0.04096505045890808, + "kl": 0.08589836191385984, + "learning_rate": 1.1373801916932907e-05, + "loss": -0.0478, + "reward": 0.7515625178813934, + "reward_std": 0.11650533098727464, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7515625178813934, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 793.402099609375, + "epoch": 0.05728916626660266, + "grad_norm": 0.04286341741681099, + "kl": 0.0660052813589573, + "learning_rate": 1.1437699680511182e-05, + "loss": -0.0462, + "reward": 0.8166666924953461, + "reward_std": 0.1813099652528763, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7479166865348816, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 748.2604370117188, + "epoch": 0.05760921747479597, + "grad_norm": 0.052840933203697205, + "kl": 0.2744094289839268, + "learning_rate": 1.1501597444089459e-05, + "loss": -0.0967, + "reward": 0.8364583730697632, + "reward_std": 0.19567819759249688, + "rewards/accuracy_reward": 0.11041667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7260416984558106, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 802.1250183105469, + "epoch": 0.057929268682989275, + "grad_norm": 0.05346866324543953, + "kl": 0.10278765726834535, + "learning_rate": 1.1565495207667731e-05, + "loss": -0.0678, + "reward": 0.821354192495346, + "reward_std": 0.15091807544231414, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7546875238418579, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 777.9396057128906, + "epoch": 0.05824931989118259, + "grad_norm": 0.0543404147028923, + "kl": 0.38175057210028174, + "learning_rate": 1.1629392971246008e-05, + "loss": -0.0913, + "reward": 0.7859375238418579, + "reward_std": 0.22937640249729158, + "rewards/accuracy_reward": 0.04375000055879354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 792.1708557128907, + "epoch": 0.0585693710993759, + "grad_norm": 0.047496624290943146, + "kl": 0.1264643581584096, + "learning_rate": 1.1693290734824283e-05, + "loss": -0.0565, + "reward": 0.830729192495346, + "reward_std": 0.19259970933198928, + "rewards/accuracy_reward": 0.08750000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.743229192495346, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 837.839599609375, + "epoch": 0.05888942230756921, + "grad_norm": 0.09101786464452744, + "kl": 0.04611029475927353, + "learning_rate": 1.1757188498402557e-05, + "loss": -0.0188, + "reward": 0.791666692495346, + "reward_std": 0.17144267708063127, + "rewards/accuracy_reward": 0.016666667722165584, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.775000023841858, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 806.0145935058594, + "epoch": 0.05920947351576252, + "grad_norm": 0.051978375762701035, + "kl": 0.13175021912902593, + "learning_rate": 1.1821086261980832e-05, + "loss": -0.0792, + "reward": 0.8328125178813934, + "reward_std": 0.21297012269496918, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7578125178813935, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 832.8146057128906, + "epoch": 0.05952952472395583, + "grad_norm": 0.05038255825638771, + "kl": 0.10234151016920805, + "learning_rate": 1.1884984025559106e-05, + "loss": -0.0399, + "reward": 0.7869791865348816, + "reward_std": 0.22053608372807504, + "rewards/accuracy_reward": 0.03541666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.751562523841858, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 793.3583557128907, + "epoch": 0.059849575932149145, + "grad_norm": 0.05190233141183853, + "kl": 0.1916276691481471, + "learning_rate": 1.1948881789137381e-05, + "loss": -0.0471, + "reward": 0.8348958373069764, + "reward_std": 0.17255963943898678, + "rewards/accuracy_reward": 0.06875000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7661458432674408, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 832.2937744140625, + "epoch": 0.060169627140342456, + "grad_norm": 0.048310406506061554, + "kl": 0.3118143383413553, + "learning_rate": 1.2012779552715656e-05, + "loss": -0.0564, + "reward": 0.8093750238418579, + "reward_std": 0.19749893844127656, + "rewards/accuracy_reward": 0.05208333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7572916865348815, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 855.8937622070313, + "epoch": 0.06048967834853577, + "grad_norm": 0.041950833052396774, + "kl": 0.13190485239028932, + "learning_rate": 1.207667731629393e-05, + "loss": -0.0476, + "reward": 0.8265625238418579, + "reward_std": 0.16573217641562224, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7890625178813935, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 851.1979431152344, + "epoch": 0.06080972955672908, + "grad_norm": 0.05058378353714943, + "kl": 0.4103548087179661, + "learning_rate": 1.2140575079872205e-05, + "loss": -0.0665, + "reward": 0.892187523841858, + "reward_std": 0.2683277949690819, + "rewards/accuracy_reward": 0.12500000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7671875178813934, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 826.3041931152344, + "epoch": 0.06112978076492239, + "grad_norm": 0.06598315387964249, + "kl": 0.1979449477046728, + "learning_rate": 1.220447284345048e-05, + "loss": -0.065, + "reward": 0.8723958492279053, + "reward_std": 0.16352599412202834, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7911458492279053, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 879.0062683105468, + "epoch": 0.061449831973115696, + "grad_norm": 0.04891607537865639, + "kl": 0.19413473419845104, + "learning_rate": 1.2268370607028754e-05, + "loss": -0.0562, + "reward": 0.8197916805744171, + "reward_std": 0.18419070467352866, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8114583492279053, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 817.7750122070313, + "epoch": 0.06176988318130901, + "grad_norm": 0.04997456446290016, + "kl": 0.21307219099253416, + "learning_rate": 1.233226837060703e-05, + "loss": -0.0385, + "reward": 0.8276041865348815, + "reward_std": 0.1461735963821411, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7859375178813934, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 854.0583435058594, + "epoch": 0.06208993438950232, + "grad_norm": 0.048078566789627075, + "kl": 0.18797751162201165, + "learning_rate": 1.2396166134185303e-05, + "loss": -0.0481, + "reward": 0.8651041865348816, + "reward_std": 0.1707301080226898, + "rewards/accuracy_reward": 0.05416666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8109375178813935, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 827.9250244140625, + "epoch": 0.06240998559769563, + "grad_norm": 0.0677054226398468, + "kl": 0.24947662875056267, + "learning_rate": 1.2460063897763578e-05, + "loss": -0.0748, + "reward": 0.852604192495346, + "reward_std": 0.16003775596618652, + "rewards/accuracy_reward": 0.039583335444331166, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8130208492279053, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 877.6354370117188, + "epoch": 0.06273003680588894, + "grad_norm": 0.04754648357629776, + "kl": 0.13086143620312213, + "learning_rate": 1.2523961661341855e-05, + "loss": -0.0492, + "reward": 0.8312500298023224, + "reward_std": 0.181324028596282, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7916666865348816, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 874.577099609375, + "epoch": 0.06305008801408225, + "grad_norm": 0.060636699199676514, + "kl": 0.8544145112857222, + "learning_rate": 1.2587859424920127e-05, + "loss": -0.0723, + "reward": 0.9250000298023224, + "reward_std": 0.1732124462723732, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8229166805744171, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 895.7646057128907, + "epoch": 0.06337013922227556, + "grad_norm": 0.04687461629509926, + "kl": 0.20718304738402366, + "learning_rate": 1.2651757188498404e-05, + "loss": -0.048, + "reward": 0.8713541865348816, + "reward_std": 0.22062713280320168, + "rewards/accuracy_reward": 0.052083334513008596, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8192708432674408, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 830.4750183105468, + "epoch": 0.06369019043046888, + "grad_norm": 0.0773216262459755, + "kl": 0.8122439078986645, + "learning_rate": 1.271565495207668e-05, + "loss": -0.1072, + "reward": 0.8343750298023224, + "reward_std": 0.2193293772637844, + "rewards/accuracy_reward": 0.06250000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7718750238418579, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 874.8125244140625, + "epoch": 0.06401024163866219, + "grad_norm": 0.09615940600633621, + "kl": 0.4952188327908516, + "learning_rate": 1.2779552715654953e-05, + "loss": -0.0731, + "reward": 0.9630208551883698, + "reward_std": 0.2012898415327072, + "rewards/accuracy_reward": 0.1562500050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8067708432674408, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 919.6937622070312, + "epoch": 0.0643302928468555, + "grad_norm": 0.055171411484479904, + "kl": 0.42661799900233743, + "learning_rate": 1.2843450479233228e-05, + "loss": -0.0889, + "reward": 0.8906250298023224, + "reward_std": 0.1967291221022606, + "rewards/accuracy_reward": 0.041666668653488156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8489583551883697, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 908.6062683105469, + "epoch": 0.06465034405504881, + "grad_norm": 0.04904462397098541, + "kl": 0.19709489084780216, + "learning_rate": 1.2907348242811502e-05, + "loss": -0.0544, + "reward": 0.9531250357627868, + "reward_std": 0.18670724853873252, + "rewards/accuracy_reward": 0.1104166692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8427083492279053, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 893.7104431152344, + "epoch": 0.06497039526324212, + "grad_norm": 0.0566297248005867, + "kl": 0.6682713240385055, + "learning_rate": 1.2971246006389777e-05, + "loss": -0.0859, + "reward": 0.9708333551883698, + "reward_std": 0.2716028355062008, + "rewards/accuracy_reward": 0.1562500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8145833551883698, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 913.8271057128907, + "epoch": 0.06529044647143543, + "grad_norm": 0.04946311190724373, + "kl": 0.3586285777390003, + "learning_rate": 1.3035143769968053e-05, + "loss": -0.0774, + "reward": 0.8635416984558105, + "reward_std": 0.2157668873667717, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8177083551883697, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.3604370117188, + "epoch": 0.06561049767962875, + "grad_norm": 0.06902287155389786, + "kl": 1.093763279914856, + "learning_rate": 1.3099041533546326e-05, + "loss": -0.0612, + "reward": 0.8843750178813934, + "reward_std": 0.1965337350964546, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8510416865348815, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 931.152099609375, + "epoch": 0.06593054888782206, + "grad_norm": 0.13752013444900513, + "kl": 1.419936703145504, + "learning_rate": 1.3162939297124601e-05, + "loss": -0.0831, + "reward": 0.9291666805744171, + "reward_std": 0.21198599338531493, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8458333492279053, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 898.8145935058594, + "epoch": 0.06625060009601537, + "grad_norm": 0.05612272769212723, + "kl": 0.6546476993709802, + "learning_rate": 1.3226837060702877e-05, + "loss": -0.0815, + "reward": 0.8734375238418579, + "reward_std": 0.21769996285438536, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8171875178813934, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 910.4833618164063, + "epoch": 0.06657065130420867, + "grad_norm": 0.05039665102958679, + "kl": 0.7447992540895939, + "learning_rate": 1.329073482428115e-05, + "loss": -0.0785, + "reward": 0.857812511920929, + "reward_std": 0.17638538777828217, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8244791805744172, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.6750244140625, + "epoch": 0.06689070251240198, + "grad_norm": 0.055274222046136856, + "kl": 0.6583600550889969, + "learning_rate": 1.3354632587859426e-05, + "loss": -0.0736, + "reward": 1.0552083730697632, + "reward_std": 0.1857333317399025, + "rewards/accuracy_reward": 0.18750000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8677083611488342, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.3333557128906, + "epoch": 0.06721075372059529, + "grad_norm": 1.966977834701538, + "kl": 0.29377989545464517, + "learning_rate": 1.3418530351437703e-05, + "loss": -0.0634, + "reward": 0.9557292103767395, + "reward_std": 0.24651620537042618, + "rewards/accuracy_reward": 0.11041666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.845312523841858, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 946.7916809082031, + "epoch": 0.0675308049287886, + "grad_norm": 0.04794445261359215, + "kl": 0.19607984125614167, + "learning_rate": 1.3482428115015975e-05, + "loss": -0.0447, + "reward": 0.8927083551883698, + "reward_std": 0.2090878263115883, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8468750238418579, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.9646057128906, + "epoch": 0.06785085613698191, + "grad_norm": 0.05501377210021019, + "kl": 0.4783424001187086, + "learning_rate": 1.3546325878594251e-05, + "loss": -0.0918, + "reward": 0.9140625119209289, + "reward_std": 0.2742405004799366, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8286458373069763, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 910.5520935058594, + "epoch": 0.06817090734517522, + "grad_norm": 0.06620791554450989, + "kl": 0.6241559140384197, + "learning_rate": 1.3610223642172523e-05, + "loss": -0.085, + "reward": 0.8677083492279053, + "reward_std": 0.22876578718423843, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8281250238418579, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 917.358349609375, + "epoch": 0.06849095855336854, + "grad_norm": 0.08500348031520844, + "kl": 0.6475715897977352, + "learning_rate": 1.36741214057508e-05, + "loss": -0.0907, + "reward": 0.8208333492279053, + "reward_std": 0.24260879009962083, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8187500178813935, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 930.9750244140625, + "epoch": 0.06881100976156185, + "grad_norm": 0.06262990832328796, + "kl": 0.3204282820224762, + "learning_rate": 1.3738019169329076e-05, + "loss": -0.0806, + "reward": 0.8671875178813935, + "reward_std": 0.228495317697525, + "rewards/accuracy_reward": 0.052083334513008596, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.815104192495346, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 871.8562683105469, + "epoch": 0.06913106096975516, + "grad_norm": 0.0846937745809555, + "kl": 0.5917581547051668, + "learning_rate": 1.380191693290735e-05, + "loss": -0.0988, + "reward": 0.8718750178813934, + "reward_std": 0.2557776391506195, + "rewards/accuracy_reward": 0.08541666846722365, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7864583432674408, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 877.3812744140625, + "epoch": 0.06945111217794847, + "grad_norm": 0.1118309274315834, + "kl": 0.9335677590221166, + "learning_rate": 1.3865814696485625e-05, + "loss": -0.1107, + "reward": 0.9578125298023223, + "reward_std": 0.20639725401997566, + "rewards/accuracy_reward": 0.12708333656191825, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8307291805744171, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 907.5916870117187, + "epoch": 0.06977116338614178, + "grad_norm": 0.14517973363399506, + "kl": 0.8811855886131525, + "learning_rate": 1.39297124600639e-05, + "loss": -0.0957, + "reward": 0.901041692495346, + "reward_std": 0.2358024850487709, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8281250119209289, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 918.8687622070313, + "epoch": 0.0700912145943351, + "grad_norm": 0.056523486971855164, + "kl": 0.31053061634302137, + "learning_rate": 1.3993610223642173e-05, + "loss": -0.0742, + "reward": 0.9885416984558105, + "reward_std": 0.24320609569549562, + "rewards/accuracy_reward": 0.1562500052154064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8322916865348816, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 921.8979370117188, + "epoch": 0.0704112658025284, + "grad_norm": 0.07962542772293091, + "kl": 0.40545434355735777, + "learning_rate": 1.4057507987220449e-05, + "loss": -0.0853, + "reward": 0.9197916865348816, + "reward_std": 0.24055335223674773, + "rewards/accuracy_reward": 0.09166666828095912, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8281250238418579, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 905.5479370117188, + "epoch": 0.07073131701072172, + "grad_norm": 0.06706108152866364, + "kl": 0.6539999444037676, + "learning_rate": 1.4121405750798722e-05, + "loss": -0.1003, + "reward": 0.9619791984558106, + "reward_std": 0.2473888464272022, + "rewards/accuracy_reward": 0.13333333600312472, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8286458551883698, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 864.2271057128906, + "epoch": 0.07105136821891503, + "grad_norm": 0.055032406002283096, + "kl": 0.45531212612986566, + "learning_rate": 1.4185303514376998e-05, + "loss": -0.0982, + "reward": 0.8796875298023223, + "reward_std": 0.19982553869485856, + "rewards/accuracy_reward": 0.07083333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8088541865348816, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.4041870117187, + "epoch": 0.07137141942710834, + "grad_norm": 0.054181210696697235, + "kl": 0.19609659500420093, + "learning_rate": 1.4249201277955273e-05, + "loss": -0.0762, + "reward": 0.9645833611488343, + "reward_std": 0.2233875073492527, + "rewards/accuracy_reward": 0.12083333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8437500238418579, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 915.5021118164062, + "epoch": 0.07169147063530165, + "grad_norm": 0.1139911636710167, + "kl": 0.36906580030918124, + "learning_rate": 1.4313099041533547e-05, + "loss": -0.0601, + "reward": 0.8807291865348816, + "reward_std": 0.19108127057552338, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8390625238418579, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.5166931152344, + "epoch": 0.07201152184349496, + "grad_norm": 0.05806288123130798, + "kl": 0.6494541350752115, + "learning_rate": 1.4376996805111822e-05, + "loss": -0.0754, + "reward": 0.9494791924953461, + "reward_std": 0.24038127958774566, + "rewards/accuracy_reward": 0.10416666995733977, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.845312523841858, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.6083557128907, + "epoch": 0.07233157305168827, + "grad_norm": 0.09360338002443314, + "kl": 0.5438719809055328, + "learning_rate": 1.4440894568690099e-05, + "loss": -0.0858, + "reward": 0.9640625298023224, + "reward_std": 0.24050120264291763, + "rewards/accuracy_reward": 0.12083333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8432291865348815, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.1437744140625, + "epoch": 0.07265162425988159, + "grad_norm": 0.06449954211711884, + "kl": 0.4873860139399767, + "learning_rate": 1.450479233226837e-05, + "loss": -0.0587, + "reward": 0.8567708492279053, + "reward_std": 0.2645621284842491, + "rewards/accuracy_reward": 0.018750000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8380208492279053, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 924.8708557128906, + "epoch": 0.0729716754680749, + "grad_norm": 0.22151648998260498, + "kl": 2.0567788064479826, + "learning_rate": 1.4568690095846648e-05, + "loss": -0.1014, + "reward": 0.9880208551883698, + "reward_std": 0.22209313362836838, + "rewards/accuracy_reward": 0.13750000409781932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8505208492279053, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 937.3479431152343, + "epoch": 0.07329172667626821, + "grad_norm": 0.07860510051250458, + "kl": 0.3857763078063726, + "learning_rate": 1.4632587859424921e-05, + "loss": -0.081, + "reward": 0.9505208611488343, + "reward_std": 0.21638043001294135, + "rewards/accuracy_reward": 0.09791666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8526041805744171, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.614599609375, + "epoch": 0.07361177788446151, + "grad_norm": 0.14230084419250488, + "kl": 0.5276958528906107, + "learning_rate": 1.4696485623003197e-05, + "loss": -0.0949, + "reward": 0.9317708492279053, + "reward_std": 0.275539268553257, + "rewards/accuracy_reward": 0.09166667181998492, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8401041865348816, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.7937683105469, + "epoch": 0.07393182909265482, + "grad_norm": 0.0633036196231842, + "kl": 0.43820536993443965, + "learning_rate": 1.4760383386581472e-05, + "loss": -0.0817, + "reward": 0.9218750238418579, + "reward_std": 0.26296704858541486, + "rewards/accuracy_reward": 0.07916666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8427083551883697, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.0666870117187, + "epoch": 0.07425188030084813, + "grad_norm": 0.12930835783481598, + "kl": 0.4649226266890764, + "learning_rate": 1.4824281150159745e-05, + "loss": -0.0555, + "reward": 0.8692708551883698, + "reward_std": 0.25685995519161225, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8276041865348815, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 972.4396118164062, + "epoch": 0.07457193150904144, + "grad_norm": 0.09969345480203629, + "kl": 0.5674844704568386, + "learning_rate": 1.488817891373802e-05, + "loss": -0.0716, + "reward": 0.9229166984558106, + "reward_std": 0.2375131815671921, + "rewards/accuracy_reward": 0.05000000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8729166924953461, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.058349609375, + "epoch": 0.07489198271723475, + "grad_norm": 0.06822368502616882, + "kl": 0.7561644535511732, + "learning_rate": 1.4952076677316296e-05, + "loss": -0.1003, + "reward": 0.8223958551883698, + "reward_std": 0.2654576122760773, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8161458551883698, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 989.1646057128906, + "epoch": 0.07521203392542807, + "grad_norm": 0.3903352916240692, + "kl": 0.5844904962927103, + "learning_rate": 1.501597444089457e-05, + "loss": -0.048, + "reward": 0.9182291746139526, + "reward_std": 0.23523074388504028, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8765625119209289, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 965.189599609375, + "epoch": 0.07553208513362138, + "grad_norm": 0.057704098522663116, + "kl": 0.7968939036130905, + "learning_rate": 1.5079872204472845e-05, + "loss": -0.0777, + "reward": 0.9833333611488342, + "reward_std": 0.23558287769556047, + "rewards/accuracy_reward": 0.10208333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8812500238418579, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.7687622070313, + "epoch": 0.07585213634181469, + "grad_norm": 0.05349546670913696, + "kl": 0.2956059377640486, + "learning_rate": 1.5143769968051119e-05, + "loss": -0.0692, + "reward": 0.8864583551883698, + "reward_std": 0.22086520940065385, + "rewards/accuracy_reward": 0.014583333767950535, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8718750178813934, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 968.858349609375, + "epoch": 0.076172187550008, + "grad_norm": 0.04913521930575371, + "kl": 0.1724690929055214, + "learning_rate": 1.5207667731629394e-05, + "loss": -0.0687, + "reward": 0.9208333551883697, + "reward_std": 0.19898689687252044, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8812500238418579, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 932.9541870117188, + "epoch": 0.07649223875820131, + "grad_norm": 0.12697535753250122, + "kl": 0.1935725949704647, + "learning_rate": 1.527156549520767e-05, + "loss": -0.0749, + "reward": 0.8927083492279053, + "reward_std": 0.21466821134090425, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8552083492279052, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.7208557128906, + "epoch": 0.07681228996639462, + "grad_norm": 0.0980035737156868, + "kl": 0.21331431940197945, + "learning_rate": 1.5335463258785944e-05, + "loss": -0.081, + "reward": 0.9015625178813934, + "reward_std": 0.21541929244995117, + "rewards/accuracy_reward": 0.029166667722165585, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8723958551883697, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 889.0000244140625, + "epoch": 0.07713234117458793, + "grad_norm": 0.08833970129489899, + "kl": 0.39752455055713654, + "learning_rate": 1.5399361022364218e-05, + "loss": -0.1273, + "reward": 0.9666666865348816, + "reward_std": 0.196412655711174, + "rewards/accuracy_reward": 0.13333333730697633, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8333333432674408, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 915.3437805175781, + "epoch": 0.07745239238278125, + "grad_norm": 0.11988092958927155, + "kl": 0.5734913632273674, + "learning_rate": 1.5463258785942495e-05, + "loss": -0.1226, + "reward": 0.8947916984558105, + "reward_std": 0.24440784752368927, + "rewards/accuracy_reward": 0.054166667722165586, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8406250298023223, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.3604370117188, + "epoch": 0.07777244359097456, + "grad_norm": 0.16344939172267914, + "kl": 1.3431407153606414, + "learning_rate": 1.552715654952077e-05, + "loss": -0.0986, + "reward": 0.8593750298023224, + "reward_std": 0.20968187004327773, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8572916924953461, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.958349609375, + "epoch": 0.07809249479916787, + "grad_norm": 0.05979590862989426, + "kl": 0.7483044236898422, + "learning_rate": 1.5591054313099042e-05, + "loss": -0.1062, + "reward": 0.9828125238418579, + "reward_std": 0.26393072605133056, + "rewards/accuracy_reward": 0.1229166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8598958492279053, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.4229370117188, + "epoch": 0.07841254600736118, + "grad_norm": 0.05631721392273903, + "kl": 0.6490201197564602, + "learning_rate": 1.5654952076677316e-05, + "loss": -0.1067, + "reward": 0.9223958551883698, + "reward_std": 0.20847276747226715, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8807291805744171, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 950.0166931152344, + "epoch": 0.07873259721555449, + "grad_norm": 0.09853655844926834, + "kl": 0.5052209571003914, + "learning_rate": 1.5718849840255593e-05, + "loss": -0.0544, + "reward": 0.9916666984558106, + "reward_std": 0.23812063187360763, + "rewards/accuracy_reward": 0.12083333935588599, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8708333611488343, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 919.2541809082031, + "epoch": 0.0790526484237478, + "grad_norm": 0.12566211819648743, + "kl": 1.113349625095725, + "learning_rate": 1.5782747603833866e-05, + "loss": -0.1065, + "reward": 0.8838541865348816, + "reward_std": 0.23888127356767655, + "rewards/accuracy_reward": 0.02916666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8546875178813934, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 945.0062683105468, + "epoch": 0.07937269963194112, + "grad_norm": 0.11134926974773407, + "kl": 1.0100045025348663, + "learning_rate": 1.584664536741214e-05, + "loss": -0.0955, + "reward": 0.9098958492279052, + "reward_std": 0.21706083416938782, + "rewards/accuracy_reward": 0.01666666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8932291865348816, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 928.4041870117187, + "epoch": 0.07969275084013443, + "grad_norm": 0.09585289657115936, + "kl": 1.7022089518606662, + "learning_rate": 1.5910543130990417e-05, + "loss": -0.1081, + "reward": 0.9406250178813934, + "reward_std": 0.23443643376231194, + "rewards/accuracy_reward": 0.07500000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.865625011920929, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.2375244140625, + "epoch": 0.08001280204832774, + "grad_norm": 0.08478322625160217, + "kl": 1.1752378184348344, + "learning_rate": 1.5974440894568694e-05, + "loss": -0.0952, + "reward": 0.9005208671092987, + "reward_std": 0.23451116681098938, + "rewards/accuracy_reward": 0.01250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8880208551883697, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 911.4916809082031, + "epoch": 0.08033285325652105, + "grad_norm": 0.17704196274280548, + "kl": 1.9864186983555556, + "learning_rate": 1.6038338658146964e-05, + "loss": -0.146, + "reward": 1.012500023841858, + "reward_std": 0.2728649765253067, + "rewards/accuracy_reward": 0.1479166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8645833492279053, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 941.0708557128906, + "epoch": 0.08065290446471435, + "grad_norm": 0.13199065625667572, + "kl": 1.3647517763078212, + "learning_rate": 1.610223642172524e-05, + "loss": -0.1164, + "reward": 0.9848958551883698, + "reward_std": 0.27456178665161135, + "rewards/accuracy_reward": 0.12291667126119137, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.861979192495346, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 954.4291870117188, + "epoch": 0.08097295567290766, + "grad_norm": 0.05368947610259056, + "kl": 0.8016906466335059, + "learning_rate": 1.6166134185303515e-05, + "loss": -0.0952, + "reward": 0.9369791805744171, + "reward_std": 0.2690433174371719, + "rewards/accuracy_reward": 0.06250000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8744791865348815, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 938.6520935058594, + "epoch": 0.08129300688110097, + "grad_norm": 0.11912751197814941, + "kl": 1.1840459078550338, + "learning_rate": 1.623003194888179e-05, + "loss": -0.0976, + "reward": 1.030729204416275, + "reward_std": 0.32827322483062743, + "rewards/accuracy_reward": 0.15000000335276126, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8807291865348816, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 961.820849609375, + "epoch": 0.08161305808929428, + "grad_norm": 0.06231944262981415, + "kl": 1.172089009732008, + "learning_rate": 1.6293929712460065e-05, + "loss": -0.0999, + "reward": 0.8864583492279052, + "reward_std": 0.24923737421631814, + "rewards/accuracy_reward": 0.01041666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8760416865348816, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.8812744140625, + "epoch": 0.0819331092974876, + "grad_norm": 0.06999265402555466, + "kl": 0.6742954470217228, + "learning_rate": 1.635782747603834e-05, + "loss": -0.0619, + "reward": 1.0625000238418578, + "reward_std": 0.21779928505420684, + "rewards/accuracy_reward": 0.16875000596046447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8937500238418579, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 913.3833435058593, + "epoch": 0.0822531605056809, + "grad_norm": 0.16180475056171417, + "kl": 2.9971985332667828, + "learning_rate": 1.6421725239616616e-05, + "loss": -0.1122, + "reward": 0.8828125238418579, + "reward_std": 0.2641023457050323, + "rewards/accuracy_reward": 0.01458333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.868229192495346, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.8458557128906, + "epoch": 0.08257321171387422, + "grad_norm": 0.3563595414161682, + "kl": 2.980782502889633, + "learning_rate": 1.648562300319489e-05, + "loss": -0.0747, + "reward": 0.9838541984558106, + "reward_std": 0.25154028832912445, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9046875298023224, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 926.2646057128907, + "epoch": 0.08289326292206753, + "grad_norm": 0.13118846714496613, + "kl": 1.8122212439775467, + "learning_rate": 1.6549520766773163e-05, + "loss": -0.0782, + "reward": 1.0093750298023223, + "reward_std": 0.2527278661727905, + "rewards/accuracy_reward": 0.09583333600312471, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9135416746139526, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 905.1271057128906, + "epoch": 0.08321331413026084, + "grad_norm": 0.06394088268280029, + "kl": 1.5887242540717126, + "learning_rate": 1.661341853035144e-05, + "loss": -0.1111, + "reward": 0.9333333432674408, + "reward_std": 0.2683968171477318, + "rewards/accuracy_reward": 0.04583333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8875000059604645, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 892.6187744140625, + "epoch": 0.08353336533845415, + "grad_norm": 0.1131933182477951, + "kl": 0.6811731692403555, + "learning_rate": 1.6677316293929714e-05, + "loss": -0.0831, + "reward": 1.0098958611488342, + "reward_std": 0.2009401135146618, + "rewards/accuracy_reward": 0.08750000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9223958551883698, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 900.8021057128906, + "epoch": 0.08385341654664746, + "grad_norm": 0.08476514369249344, + "kl": 1.0106553114950656, + "learning_rate": 1.6741214057507987e-05, + "loss": -0.131, + "reward": 0.9656250298023223, + "reward_std": 0.22452602237462999, + "rewards/accuracy_reward": 0.052083334885537624, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9135416805744171, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.2646057128907, + "epoch": 0.08417346775484078, + "grad_norm": 0.08032719045877457, + "kl": 0.8753206558525563, + "learning_rate": 1.6805111821086264e-05, + "loss": -0.0422, + "reward": 1.049479204416275, + "reward_std": 0.2446434311568737, + "rewards/accuracy_reward": 0.14791667014360427, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9015625238418579, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 899.0416931152344, + "epoch": 0.08449351896303409, + "grad_norm": 0.5515336394309998, + "kl": 1.3456205856055021, + "learning_rate": 1.6869009584664538e-05, + "loss": -0.1086, + "reward": 0.9744791865348816, + "reward_std": 0.22072734907269478, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9098958492279052, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 885.8771057128906, + "epoch": 0.0848135701712274, + "grad_norm": 0.06949339061975479, + "kl": 0.9026762183755637, + "learning_rate": 1.693290734824281e-05, + "loss": -0.098, + "reward": 1.0098958551883697, + "reward_std": 0.22607185021042825, + "rewards/accuracy_reward": 0.09583333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9140625238418579, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 902.6458618164063, + "epoch": 0.08513362137942071, + "grad_norm": 0.09065587818622589, + "kl": 1.112578734010458, + "learning_rate": 1.699680511182109e-05, + "loss": -0.0904, + "reward": 1.0500000298023224, + "reward_std": 0.2474749308079481, + "rewards/accuracy_reward": 0.13541666865348817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9145833492279053, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 850.6291931152343, + "epoch": 0.08545367258761402, + "grad_norm": 0.0645834282040596, + "kl": 0.8398421596735716, + "learning_rate": 1.7060702875399362e-05, + "loss": -0.1208, + "reward": 1.0151041805744172, + "reward_std": 0.2006215013563633, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9130208551883697, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 903.7958557128907, + "epoch": 0.08577372379580733, + "grad_norm": 0.059141870588064194, + "kl": 0.7011767126619816, + "learning_rate": 1.712460063897764e-05, + "loss": -0.0965, + "reward": 1.0427083611488341, + "reward_std": 0.19903551936149597, + "rewards/accuracy_reward": 0.10833333749324084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9343750178813934, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 868.0416870117188, + "epoch": 0.08609377500400064, + "grad_norm": 0.08352208137512207, + "kl": 0.6763238899409771, + "learning_rate": 1.7188498402555913e-05, + "loss": -0.103, + "reward": 0.9942708671092987, + "reward_std": 0.20066261664032936, + "rewards/accuracy_reward": 0.052083334513008596, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9421875298023223, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 882.5896057128906, + "epoch": 0.08641382621219396, + "grad_norm": 0.058626312762498856, + "kl": 1.3493116207420826, + "learning_rate": 1.7252396166134186e-05, + "loss": -0.0921, + "reward": 1.027604192495346, + "reward_std": 0.23043378591537475, + "rewards/accuracy_reward": 0.10000000353902579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9276041865348816, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 897.5166870117188, + "epoch": 0.08673387742038727, + "grad_norm": 0.07342275232076645, + "kl": 0.8855142720043659, + "learning_rate": 1.7316293929712463e-05, + "loss": -0.0822, + "reward": 1.0546875298023224, + "reward_std": 0.27408884316682813, + "rewards/accuracy_reward": 0.12916666753590106, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9255208492279052, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 889.0729370117188, + "epoch": 0.08705392862858058, + "grad_norm": 0.06917428970336914, + "kl": 1.0903139643371105, + "learning_rate": 1.7380191693290737e-05, + "loss": -0.0621, + "reward": 1.035416704416275, + "reward_std": 0.22211865484714508, + "rewards/accuracy_reward": 0.10208333674818278, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9333333551883698, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 872.6250183105469, + "epoch": 0.08737397983677389, + "grad_norm": 0.0981907919049263, + "kl": 0.7083079494535923, + "learning_rate": 1.744408945686901e-05, + "loss": -0.053, + "reward": 1.0489583671092988, + "reward_std": 0.16593048833310603, + "rewards/accuracy_reward": 0.09166666828095912, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916865348816, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 877.7541809082031, + "epoch": 0.08769403104496719, + "grad_norm": 0.18095174431800842, + "kl": 2.6296974059194325, + "learning_rate": 1.7507987220447287e-05, + "loss": -0.0654, + "reward": 1.018229180574417, + "reward_std": 0.1441993907094002, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9473958492279053, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 875.0437744140625, + "epoch": 0.0880140822531605, + "grad_norm": 0.13419152796268463, + "kl": 3.1912473395466803, + "learning_rate": 1.757188498402556e-05, + "loss": -0.0698, + "reward": 0.9859375178813934, + "reward_std": 0.19504168927669524, + "rewards/accuracy_reward": 0.05416666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9317708492279053, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 862.0500244140625, + "epoch": 0.08833413346135381, + "grad_norm": 0.21924547851085663, + "kl": 3.4867818232625725, + "learning_rate": 1.7635782747603835e-05, + "loss": -0.0687, + "reward": 0.9635416865348816, + "reward_std": 0.1808617640286684, + "rewards/accuracy_reward": 0.03125000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.932291692495346, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 864.8125183105469, + "epoch": 0.08865418466954712, + "grad_norm": 0.49595341086387634, + "kl": 2.0808908861130475, + "learning_rate": 1.7699680511182108e-05, + "loss": -0.0736, + "reward": 1.0552083551883698, + "reward_std": 0.2792856268584728, + "rewards/accuracy_reward": 0.14166666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9135416805744171, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 814.4771118164062, + "epoch": 0.08897423587774043, + "grad_norm": 0.13152822852134705, + "kl": 1.0621005825698375, + "learning_rate": 1.7763578274760385e-05, + "loss": -0.1042, + "reward": 0.9630208551883698, + "reward_std": 0.20329482033848761, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9171875178813934, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 760.3854309082031, + "epoch": 0.08929428708593375, + "grad_norm": 0.33260223269462585, + "kl": 1.3636042416095733, + "learning_rate": 1.782747603833866e-05, + "loss": -0.152, + "reward": 0.9802083730697632, + "reward_std": 0.26020273864269255, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.901041692495346, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 775.677099609375, + "epoch": 0.08961433829412706, + "grad_norm": 0.0919167548418045, + "kl": 0.8035856388509274, + "learning_rate": 1.7891373801916932e-05, + "loss": -0.0947, + "reward": 1.0625000298023224, + "reward_std": 0.23918063938617706, + "rewards/accuracy_reward": 0.1604166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9020833551883698, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 843.8791809082031, + "epoch": 0.08993438950232037, + "grad_norm": 0.06751978397369385, + "kl": 0.6697272006422281, + "learning_rate": 1.795527156549521e-05, + "loss": -0.0986, + "reward": 0.9140625119209289, + "reward_std": 0.24967138916254045, + "rewards/accuracy_reward": 0.00625, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9078125119209289, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 797.4062744140625, + "epoch": 0.09025444071051368, + "grad_norm": 0.06286520510911942, + "kl": 0.4617580160498619, + "learning_rate": 1.8019169329073486e-05, + "loss": -0.0937, + "reward": 0.9760416924953461, + "reward_std": 0.2386911503970623, + "rewards/accuracy_reward": 0.05625000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9197916865348816, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 859.6125244140625, + "epoch": 0.09057449191870699, + "grad_norm": 0.08970347791910172, + "kl": 0.37080557718873025, + "learning_rate": 1.808306709265176e-05, + "loss": -0.0966, + "reward": 0.9406250238418579, + "reward_std": 0.22452181503176688, + "rewards/accuracy_reward": 0.012500000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9281250178813935, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 862.0187683105469, + "epoch": 0.0908945431269003, + "grad_norm": 0.11629839241504669, + "kl": 0.5317555744200945, + "learning_rate": 1.8146964856230033e-05, + "loss": -0.0823, + "reward": 0.9687500238418579, + "reward_std": 0.1916698656976223, + "rewards/accuracy_reward": 0.031250000558793546, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9375000178813935, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 808.6896118164062, + "epoch": 0.09121459433509362, + "grad_norm": 0.04873790219426155, + "kl": 0.41392175033688544, + "learning_rate": 1.8210862619808307e-05, + "loss": -0.0581, + "reward": 1.0192708551883698, + "reward_std": 0.18486709110438823, + "rewards/accuracy_reward": 0.06250000037252904, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9567708492279052, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 823.9062683105469, + "epoch": 0.09153464554328693, + "grad_norm": 0.06124405190348625, + "kl": 0.5761492840945721, + "learning_rate": 1.8274760383386584e-05, + "loss": -0.0575, + "reward": 1.0343750298023224, + "reward_std": 0.14318594969809056, + "rewards/accuracy_reward": 0.08125000223517417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9531250178813935, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 852.4708557128906, + "epoch": 0.09185469675148024, + "grad_norm": 0.04435551166534424, + "kl": 0.8433479636907577, + "learning_rate": 1.8338658146964858e-05, + "loss": -0.0992, + "reward": 1.0718750298023223, + "reward_std": 0.19699937254190444, + "rewards/accuracy_reward": 0.12083333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9510416805744171, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 819.7062805175781, + "epoch": 0.09217474795967355, + "grad_norm": 0.05230933800339699, + "kl": 0.5397528253495694, + "learning_rate": 1.840255591054313e-05, + "loss": -0.0561, + "reward": 0.9604166984558106, + "reward_std": 0.16495687067508696, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9541666865348816, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 817.6687683105469, + "epoch": 0.09249479916786686, + "grad_norm": 0.044067054986953735, + "kl": 1.0274098832160234, + "learning_rate": 1.8466453674121408e-05, + "loss": -0.0696, + "reward": 1.0661458790302276, + "reward_std": 0.138642318546772, + "rewards/accuracy_reward": 0.09791666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.968229204416275, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 798.7437683105469, + "epoch": 0.09281485037606017, + "grad_norm": 0.13034485280513763, + "kl": 0.8338620312511921, + "learning_rate": 1.8530351437699682e-05, + "loss": -0.0463, + "reward": 1.0578125298023224, + "reward_std": 0.13859039451926947, + "rewards/accuracy_reward": 0.08750000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.970312523841858, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 833.6333557128906, + "epoch": 0.09313490158425348, + "grad_norm": 0.17142996191978455, + "kl": 1.5055570479482412, + "learning_rate": 1.8594249201277955e-05, + "loss": -0.04, + "reward": 1.0614583611488342, + "reward_std": 0.19720946326851846, + "rewards/accuracy_reward": 0.10833333693444729, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9531250238418579, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 833.6458618164063, + "epoch": 0.0934549527924468, + "grad_norm": 0.092293381690979, + "kl": 0.9619705751538277, + "learning_rate": 1.8658146964856232e-05, + "loss": -0.0269, + "reward": 1.0546875238418578, + "reward_std": 0.11363485138863325, + "rewards/accuracy_reward": 0.08125000223517417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375178813934, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 812.5312683105469, + "epoch": 0.09377500400064011, + "grad_norm": 0.18601863086223602, + "kl": 1.3381911851465702, + "learning_rate": 1.8722044728434506e-05, + "loss": -0.0482, + "reward": 1.0447916984558105, + "reward_std": 0.15205052942037584, + "rewards/accuracy_reward": 0.08541666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9593750238418579, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 816.6041870117188, + "epoch": 0.09409505520883342, + "grad_norm": 0.3239324986934662, + "kl": 2.9891052283346653, + "learning_rate": 1.878594249201278e-05, + "loss": -0.0762, + "reward": 0.9588541865348816, + "reward_std": 0.1921792333945632, + "rewards/accuracy_reward": 0.020833334513008596, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9380208492279053, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 831.8125244140625, + "epoch": 0.09441510641702672, + "grad_norm": 0.12830707430839539, + "kl": 0.6059975288808346, + "learning_rate": 1.8849840255591057e-05, + "loss": -0.0271, + "reward": 1.0671875298023223, + "reward_std": 0.10790065918117761, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041865348816, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 818.6937683105468, + "epoch": 0.09473515762522003, + "grad_norm": 0.04302423447370529, + "kl": 1.0549761176109314, + "learning_rate": 1.891373801916933e-05, + "loss": -0.0439, + "reward": 0.9984375238418579, + "reward_std": 0.1427600732073188, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9526041805744171, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 814.9416870117187, + "epoch": 0.09505520883341334, + "grad_norm": 0.14698351919651031, + "kl": 0.6408292330801487, + "learning_rate": 1.8977635782747604e-05, + "loss": -0.0273, + "reward": 0.9041666805744171, + "reward_std": 0.19298009127378463, + "rewards/accuracy_reward": 0.03958333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8645833492279053, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 786.627099609375, + "epoch": 0.09537526004160665, + "grad_norm": 0.12881284952163696, + "kl": 0.522902286797762, + "learning_rate": 1.904153354632588e-05, + "loss": -0.0235, + "reward": 0.9333333671092987, + "reward_std": 0.20901244282722473, + "rewards/accuracy_reward": 0.06250000093132257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8708333492279052, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 867.3333557128906, + "epoch": 0.09569531124979996, + "grad_norm": 0.0646795704960823, + "kl": 0.4893409203737974, + "learning_rate": 1.9105431309904154e-05, + "loss": -0.0422, + "reward": 1.0057291984558105, + "reward_std": 0.1382185447961092, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458611488343, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 782.7833557128906, + "epoch": 0.09601536245799328, + "grad_norm": 0.06920057535171509, + "kl": 1.862360952794552, + "learning_rate": 1.916932907348243e-05, + "loss": -0.1221, + "reward": 0.9901041924953461, + "reward_std": 0.20693937465548515, + "rewards/accuracy_reward": 0.04791666902601719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9421875178813934, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 838.0146057128907, + "epoch": 0.09633541366618659, + "grad_norm": 0.06644880771636963, + "kl": 1.1580650568008424, + "learning_rate": 1.9233226837060705e-05, + "loss": -0.1085, + "reward": 0.9890625298023223, + "reward_std": 0.1861676774919033, + "rewards/accuracy_reward": 0.04375000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9453125298023224, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 843.3458557128906, + "epoch": 0.0966554648743799, + "grad_norm": 0.07187503576278687, + "kl": 0.7149579245597124, + "learning_rate": 1.929712460063898e-05, + "loss": -0.0523, + "reward": 1.0875000476837158, + "reward_std": 0.14358032830059528, + "rewards/accuracy_reward": 0.1166666703298688, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333551883698, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 793.3000244140625, + "epoch": 0.09697551608257321, + "grad_norm": 0.07028498500585556, + "kl": 0.5270621210336686, + "learning_rate": 1.9361022364217256e-05, + "loss": -0.0411, + "reward": 1.0192708492279052, + "reward_std": 0.14648367427289485, + "rewards/accuracy_reward": 0.0520833358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875178813935, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 851.6541809082031, + "epoch": 0.09729556729076652, + "grad_norm": 0.08742289245128632, + "kl": 0.9225699122995138, + "learning_rate": 1.942492012779553e-05, + "loss": -0.0382, + "reward": 1.0802083492279053, + "reward_std": 0.17449979558587075, + "rewards/accuracy_reward": 0.1083333371207118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750238418579, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 815.5333557128906, + "epoch": 0.09761561849895983, + "grad_norm": 0.148127943277359, + "kl": 1.2980008512735366, + "learning_rate": 1.9488817891373803e-05, + "loss": -0.0697, + "reward": 1.0197916865348815, + "reward_std": 0.1628091825172305, + "rewards/accuracy_reward": 0.05625000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 822.1250244140625, + "epoch": 0.09793566970715314, + "grad_norm": 0.08461788296699524, + "kl": 0.48892029859125613, + "learning_rate": 1.955271565495208e-05, + "loss": -0.026, + "reward": 1.1718750298023224, + "reward_std": 0.14723527543246745, + "rewards/accuracy_reward": 0.19166667219251393, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083551883698, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 825.6416748046875, + "epoch": 0.09825572091534646, + "grad_norm": 0.09465766698122025, + "kl": 1.6012873794883489, + "learning_rate": 1.9616613418530353e-05, + "loss": -0.0675, + "reward": 1.0411458551883697, + "reward_std": 0.14493329152464868, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791805744171, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 861.5354431152343, + "epoch": 0.09857577212353977, + "grad_norm": 0.07408089190721512, + "kl": 1.533422616124153, + "learning_rate": 1.9680511182108627e-05, + "loss": -0.0936, + "reward": 0.9979166805744171, + "reward_std": 0.18721573576331138, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.956250011920929, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 880.7458618164062, + "epoch": 0.09889582333173308, + "grad_norm": 0.04676924645900726, + "kl": 0.6797443836927414, + "learning_rate": 1.97444089456869e-05, + "loss": -0.0341, + "reward": 1.035416692495346, + "reward_std": 0.1299929341301322, + "rewards/accuracy_reward": 0.06041666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000178813935, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 895.7854370117187, + "epoch": 0.09921587453992639, + "grad_norm": 0.10597676038742065, + "kl": 0.8373191263526678, + "learning_rate": 1.9808306709265177e-05, + "loss": -0.0276, + "reward": 1.0484375178813934, + "reward_std": 0.11354983560740947, + "rewards/accuracy_reward": 0.07708333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541805744171, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 871.6062622070312, + "epoch": 0.0995359257481197, + "grad_norm": 0.04293238744139671, + "kl": 0.7700838308781386, + "learning_rate": 1.987220447284345e-05, + "loss": -0.0357, + "reward": 1.0250000298023223, + "reward_std": 0.16061475723981858, + "rewards/accuracy_reward": 0.06041666846722364, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833492279052, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 876.1854370117187, + "epoch": 0.09985597695631301, + "grad_norm": 0.04478353261947632, + "kl": 0.7534620493650437, + "learning_rate": 1.9936102236421725e-05, + "loss": -0.0471, + "reward": 1.0213542044162751, + "reward_std": 0.17604071646928787, + "rewards/accuracy_reward": 0.0520833358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708492279053, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 847.4937805175781, + "epoch": 0.10017602816450633, + "grad_norm": 0.217251256108284, + "kl": 1.3029839828610421, + "learning_rate": 2e-05, + "loss": -0.0849, + "reward": 0.9958333551883698, + "reward_std": 0.1536863178014755, + "rewards/accuracy_reward": 0.0375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333551883697, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 871.7458557128906, + "epoch": 0.10049607937269964, + "grad_norm": 0.07100000232458115, + "kl": 1.1783099208027124, + "learning_rate": 1.99999937547761e-05, + "loss": -0.0472, + "reward": 1.017187523841858, + "reward_std": 0.1712120622396469, + "rewards/accuracy_reward": 0.0541666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208432674408, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 864.9416809082031, + "epoch": 0.10081613058089295, + "grad_norm": 0.05442134663462639, + "kl": 0.8484370153397321, + "learning_rate": 1.9999975019112187e-05, + "loss": -0.0856, + "reward": 1.0411458730697631, + "reward_std": 0.18634050339460373, + "rewards/accuracy_reward": 0.09583333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9453125238418579, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 812.7146057128906, + "epoch": 0.10113618178908626, + "grad_norm": 0.05835329368710518, + "kl": 0.7999336563050747, + "learning_rate": 1.9999943793031672e-05, + "loss": -0.071, + "reward": 1.035937535762787, + "reward_std": 0.2407546177506447, + "rewards/accuracy_reward": 0.12500000279396772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9109375238418579, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 850.0979370117187, + "epoch": 0.10145623299727956, + "grad_norm": 0.08183707296848297, + "kl": 0.9409760713577271, + "learning_rate": 1.9999900076573555e-05, + "loss": -0.0842, + "reward": 0.9593750357627868, + "reward_std": 0.2523031309247017, + "rewards/accuracy_reward": 0.0666666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8927083551883698, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 829.1187744140625, + "epoch": 0.10177628420547287, + "grad_norm": 0.16990694403648376, + "kl": 0.8687799766659736, + "learning_rate": 1.999984386979244e-05, + "loss": -0.0651, + "reward": 0.9723958611488343, + "reward_std": 0.20160830169916152, + "rewards/accuracy_reward": 0.09166666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8807291924953461, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 812.4437683105468, + "epoch": 0.10209633541366618, + "grad_norm": 0.10622044652700424, + "kl": 0.5579104773700237, + "learning_rate": 1.999977517275853e-05, + "loss": -0.0654, + "reward": 0.9927083551883698, + "reward_std": 0.25271194577217104, + "rewards/accuracy_reward": 0.11875000409781933, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8739583432674408, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 827.0458618164063, + "epoch": 0.10241638662185949, + "grad_norm": 0.14122267067432404, + "kl": 0.9455968786031008, + "learning_rate": 1.9999693985557632e-05, + "loss": -0.118, + "reward": 0.9593750357627868, + "reward_std": 0.20687768310308458, + "rewards/accuracy_reward": 0.08333333730697631, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8760416865348816, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 791.833349609375, + "epoch": 0.1027364378300528, + "grad_norm": 0.1345462054014206, + "kl": 0.6353983476758003, + "learning_rate": 1.999960030829115e-05, + "loss": -0.1179, + "reward": 0.9557291865348816, + "reward_std": 0.27604651898145677, + "rewards/accuracy_reward": 0.09583333749324083, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8598958492279053, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 807.3687805175781, + "epoch": 0.10305648903824612, + "grad_norm": 0.19280694425106049, + "kl": 1.307307593524456, + "learning_rate": 1.99994941410761e-05, + "loss": -0.139, + "reward": 0.8828125238418579, + "reward_std": 0.2544379264116287, + "rewards/accuracy_reward": 0.01458333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8682291865348816, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 823.5916931152344, + "epoch": 0.10337654024643943, + "grad_norm": 0.2708165943622589, + "kl": 1.0159433037042618, + "learning_rate": 1.9999375484045077e-05, + "loss": -0.1316, + "reward": 0.8807291924953461, + "reward_std": 0.26034645885229113, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8411458551883697, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 866.6270935058594, + "epoch": 0.10369659145463274, + "grad_norm": 0.6394182443618774, + "kl": 1.2064526498317718, + "learning_rate": 1.99992443373463e-05, + "loss": -0.1229, + "reward": 0.9432291865348816, + "reward_std": 0.275150665640831, + "rewards/accuracy_reward": 0.09166666716337205, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8515625178813935, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 898.0916870117187, + "epoch": 0.10401664266282605, + "grad_norm": 0.07190922647714615, + "kl": 1.2092401087284088, + "learning_rate": 1.999910070114357e-05, + "loss": -0.1126, + "reward": 0.9255208551883698, + "reward_std": 0.2189515396952629, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.883854192495346, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 915.2458557128906, + "epoch": 0.10433669387101936, + "grad_norm": 0.06787604093551636, + "kl": 0.6662247460335493, + "learning_rate": 1.99989445756163e-05, + "loss": -0.0825, + "reward": 0.9942708492279053, + "reward_std": 0.20946806892752648, + "rewards/accuracy_reward": 0.07500000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9192708551883697, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.2458435058594, + "epoch": 0.10465674507921267, + "grad_norm": 0.0813896581530571, + "kl": 0.6081900119781494, + "learning_rate": 1.999877596095949e-05, + "loss": -0.0544, + "reward": 1.0223958611488342, + "reward_std": 0.2039100807160139, + "rewards/accuracy_reward": 0.09375000353902578, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9286458551883697, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 955.689599609375, + "epoch": 0.10497679628740599, + "grad_norm": 0.06041910871863365, + "kl": 0.9724824227392673, + "learning_rate": 1.9998594857383756e-05, + "loss": -0.0732, + "reward": 1.024479204416275, + "reward_std": 0.20781310126185418, + "rewards/accuracy_reward": 0.11666666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9078125298023224, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.4541870117188, + "epoch": 0.1052968474955993, + "grad_norm": 0.07968918979167938, + "kl": 0.5805226668715477, + "learning_rate": 1.99984012651153e-05, + "loss": -0.0639, + "reward": 1.0041666865348815, + "reward_std": 0.23270507901906967, + "rewards/accuracy_reward": 0.09166666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.912500011920929, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.1896118164062, + "epoch": 0.10561689870379261, + "grad_norm": 0.04409494251012802, + "kl": 1.243473695591092, + "learning_rate": 1.999819518439593e-05, + "loss": -0.0951, + "reward": 1.0411458551883697, + "reward_std": 0.2635225549340248, + "rewards/accuracy_reward": 0.14166667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8994791746139527, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 953.5604309082031, + "epoch": 0.10593694991198592, + "grad_norm": 0.3219020366668701, + "kl": 0.40860943496227264, + "learning_rate": 1.9997976615483042e-05, + "loss": -0.0361, + "reward": 1.0703125178813935, + "reward_std": 0.22589490786194802, + "rewards/accuracy_reward": 0.14583333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9244791865348816, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 940.6187683105469, + "epoch": 0.10625700112017923, + "grad_norm": 0.13125485181808472, + "kl": 0.620623991638422, + "learning_rate": 1.9997745558649647e-05, + "loss": -0.0476, + "reward": 0.9453125238418579, + "reward_std": 0.19392418172210454, + "rewards/accuracy_reward": 0.02708333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9182291865348816, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 947.670849609375, + "epoch": 0.10657705232837254, + "grad_norm": 0.054816894233226776, + "kl": 1.84022078178823, + "learning_rate": 1.9997502014184348e-05, + "loss": -0.0839, + "reward": 1.018229204416275, + "reward_std": 0.2765422374010086, + "rewards/accuracy_reward": 0.11875000596046448, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.899479192495346, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.4208557128907, + "epoch": 0.10689710353656585, + "grad_norm": 0.099627286195755, + "kl": 1.0009838584810495, + "learning_rate": 1.9997245982391335e-05, + "loss": -0.0466, + "reward": 0.9750000238418579, + "reward_std": 0.23383331745862962, + "rewards/accuracy_reward": 0.05625000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9187500238418579, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 934.4166809082031, + "epoch": 0.10721715474475917, + "grad_norm": 0.1346578598022461, + "kl": 2.6244244679808615, + "learning_rate": 1.9996977463590404e-05, + "loss": -0.0918, + "reward": 0.9026041865348816, + "reward_std": 0.2553748100996017, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8963541805744171, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 904.3145935058594, + "epoch": 0.10753720595295248, + "grad_norm": 0.264816552400589, + "kl": 3.066745951771736, + "learning_rate": 1.9996696458116953e-05, + "loss": -0.1219, + "reward": 1.0015625298023223, + "reward_std": 0.3009426400065422, + "rewards/accuracy_reward": 0.12083333898335695, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8807291924953461, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 890.7854309082031, + "epoch": 0.10785725716114579, + "grad_norm": 4.931536674499512, + "kl": 2.893843525648117, + "learning_rate": 1.9996402966321962e-05, + "loss": -0.1432, + "reward": 0.8864583551883698, + "reward_std": 0.2413546308875084, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8843750178813934, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 856.8562622070312, + "epoch": 0.1081773083693391, + "grad_norm": 0.24606415629386902, + "kl": 1.80047315210104, + "learning_rate": 1.9996096988572026e-05, + "loss": -0.1215, + "reward": 0.9812500238418579, + "reward_std": 0.250549491494894, + "rewards/accuracy_reward": 0.06666666958481074, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9145833551883698, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 782.8062805175781, + "epoch": 0.1084973595775324, + "grad_norm": 0.3353343605995178, + "kl": 1.0522517710924149, + "learning_rate": 1.999577852524931e-05, + "loss": -0.1469, + "reward": 1.084375023841858, + "reward_std": 0.2656236305832863, + "rewards/accuracy_reward": 0.2104166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8739583551883697, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 768.8812683105468, + "epoch": 0.10881741078572571, + "grad_norm": 0.5174300670623779, + "kl": 1.7762677013874053, + "learning_rate": 1.9995447576751605e-05, + "loss": -0.1842, + "reward": 1.0130208492279054, + "reward_std": 0.317549966275692, + "rewards/accuracy_reward": 0.17916667219251395, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8338541746139526, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.8937683105469, + "epoch": 0.10913746199391902, + "grad_norm": 0.8624448776245117, + "kl": 3.7803176999092103, + "learning_rate": 1.999510414349227e-05, + "loss": -0.3299, + "reward": 0.7390625357627869, + "reward_std": 0.36838062703609464, + "rewards/accuracy_reward": 0.07291666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6661458611488342, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.7271026611328, + "epoch": 0.10945751320211233, + "grad_norm": 0.5762320160865784, + "kl": 2.1153181910514833, + "learning_rate": 1.9994748225900277e-05, + "loss": -0.3942, + "reward": 0.6640625327825547, + "reward_std": 0.36143629252910614, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.622395858168602, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.2395965576172, + "epoch": 0.10977756441030564, + "grad_norm": 0.8417689204216003, + "kl": 1.8840416431427003, + "learning_rate": 1.999437982442017e-05, + "loss": -0.4202, + "reward": 0.5890625149011612, + "reward_std": 0.38141846358776094, + "rewards/accuracy_reward": 0.01666666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.572395846247673, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 717.4833557128907, + "epoch": 0.11009761561849896, + "grad_norm": 0.527220606803894, + "kl": 1.1728574931621552, + "learning_rate": 1.9993998939512113e-05, + "loss": -0.2318, + "reward": 0.7536458611488343, + "reward_std": 0.3177986368536949, + "rewards/accuracy_reward": 0.010416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 806.5271057128906, + "epoch": 0.11041766682669227, + "grad_norm": 0.39360710978507996, + "kl": 0.9293541312217712, + "learning_rate": 1.9993605571651838e-05, + "loss": -0.1071, + "reward": 0.9187500238418579, + "reward_std": 0.28557206094264986, + "rewards/accuracy_reward": 0.07291666921228171, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8458333432674408, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 802.089599609375, + "epoch": 0.11073771803488558, + "grad_norm": 0.4722810387611389, + "kl": 1.2333260536193849, + "learning_rate": 1.9993199721330684e-05, + "loss": -0.0961, + "reward": 0.9656250417232514, + "reward_std": 0.28917737156152723, + "rewards/accuracy_reward": 0.09791666977107524, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8677083551883698, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 814.895849609375, + "epoch": 0.11105776924307889, + "grad_norm": 1.1194065809249878, + "kl": 1.6250961005687714, + "learning_rate": 1.9992781389055576e-05, + "loss": -0.089, + "reward": 0.8333333492279053, + "reward_std": 0.2882629260420799, + "rewards/accuracy_reward": 0.008333333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8250000178813934, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 799.427099609375, + "epoch": 0.1113778204512722, + "grad_norm": 0.31969064474105835, + "kl": 0.6975180029869079, + "learning_rate": 1.999235057534903e-05, + "loss": -0.065, + "reward": 0.9895833671092987, + "reward_std": 0.22908852323889733, + "rewards/accuracy_reward": 0.11666667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8729166984558105, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 855.2979370117188, + "epoch": 0.11169787165946551, + "grad_norm": 0.10219820588827133, + "kl": 0.6862524829804897, + "learning_rate": 1.9991907280749148e-05, + "loss": -0.0616, + "reward": 0.985416692495346, + "reward_std": 0.2100462459027767, + "rewards/accuracy_reward": 0.07083333488553763, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9145833432674408, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 830.7208557128906, + "epoch": 0.11201792286765883, + "grad_norm": 0.08064544945955276, + "kl": 0.41558183878660204, + "learning_rate": 1.999145150580963e-05, + "loss": -0.0925, + "reward": 1.020312523841858, + "reward_std": 0.2063022270798683, + "rewards/accuracy_reward": 0.10208333488553763, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9182291865348816, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 853.3708618164062, + "epoch": 0.11233797407585214, + "grad_norm": 0.0841941237449646, + "kl": 0.29322315752506256, + "learning_rate": 1.9990983251099755e-05, + "loss": -0.0366, + "reward": 0.9796875238418579, + "reward_std": 0.18505462631583214, + "rewards/accuracy_reward": 0.05416666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9255208551883698, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 903.9750244140625, + "epoch": 0.11265802528404545, + "grad_norm": 0.13465115427970886, + "kl": 0.41350857689976694, + "learning_rate": 1.99905025172044e-05, + "loss": -0.0597, + "reward": 1.0109375178813935, + "reward_std": 0.24164980798959732, + "rewards/accuracy_reward": 0.09166667014360427, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9192708492279053, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 899.0729309082031, + "epoch": 0.11297807649223876, + "grad_norm": 0.08185650408267975, + "kl": 0.18878451809287072, + "learning_rate": 1.999000930472401e-05, + "loss": -0.0331, + "reward": 1.0036458432674409, + "reward_std": 0.22958993241190911, + "rewards/accuracy_reward": 0.08541667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9182291746139526, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 914.9812683105469, + "epoch": 0.11329812770043207, + "grad_norm": 0.07535237818956375, + "kl": 0.24463820680975915, + "learning_rate": 1.9989503614274647e-05, + "loss": -0.0303, + "reward": 1.0432291984558106, + "reward_std": 0.14151984304189683, + "rewards/accuracy_reward": 0.0854166692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.957812511920929, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 887.8062622070313, + "epoch": 0.11361817890862538, + "grad_norm": 0.08462147414684296, + "kl": 0.13488904759287834, + "learning_rate": 1.998898544648793e-05, + "loss": -0.0307, + "reward": 1.0223958611488342, + "reward_std": 0.18293874636292456, + "rewards/accuracy_reward": 0.07083333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9515625178813935, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 883.7875183105468, + "epoch": 0.1139382301168187, + "grad_norm": 0.09306566417217255, + "kl": 0.21404382959008217, + "learning_rate": 1.9988454802011077e-05, + "loss": -0.0349, + "reward": 1.0661458611488341, + "reward_std": 0.16979529410600663, + "rewards/accuracy_reward": 0.11041666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291805744171, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 874.3729431152344, + "epoch": 0.114258281325012, + "grad_norm": 0.08024463802576065, + "kl": 0.23921761214733123, + "learning_rate": 1.9987911681506886e-05, + "loss": -0.0446, + "reward": 1.0171875357627869, + "reward_std": 0.14690931476652622, + "rewards/accuracy_reward": 0.05625000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375178813935, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 870.5250122070313, + "epoch": 0.11457833253320532, + "grad_norm": 0.14203424751758575, + "kl": 0.28406247273087504, + "learning_rate": 1.9987356085653738e-05, + "loss": -0.0558, + "reward": 1.035416692495346, + "reward_std": 0.18686591796576976, + "rewards/accuracy_reward": 0.09583333693444729, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9395833432674408, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 818.483349609375, + "epoch": 0.11489838374139863, + "grad_norm": 0.28132274746894836, + "kl": 0.4927747845649719, + "learning_rate": 1.9986788015145597e-05, + "loss": -0.1037, + "reward": 0.997916704416275, + "reward_std": 0.19941441863775253, + "rewards/accuracy_reward": 0.05625000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9416666924953461, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 880.3646057128906, + "epoch": 0.11521843494959194, + "grad_norm": 0.05523926019668579, + "kl": 0.2463509477674961, + "learning_rate": 1.9986207470692012e-05, + "loss": -0.023, + "reward": 1.0234375298023224, + "reward_std": 0.1478660933673382, + "rewards/accuracy_reward": 0.07708333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9463541865348816, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 821.1583557128906, + "epoch": 0.11553848615778524, + "grad_norm": 0.05657940357923508, + "kl": 0.20335216149687768, + "learning_rate": 1.99856144530181e-05, + "loss": -0.0431, + "reward": 1.0885416924953462, + "reward_std": 0.16149957925081254, + "rewards/accuracy_reward": 0.12708333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583551883698, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 840.4000122070313, + "epoch": 0.11585853736597855, + "grad_norm": 0.06203540042042732, + "kl": 0.3108247257769108, + "learning_rate": 1.9985008962864582e-05, + "loss": -0.0275, + "reward": 1.0005208551883698, + "reward_std": 0.17917531132698059, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9546875119209289, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 803.0208557128906, + "epoch": 0.11617858857417186, + "grad_norm": 0.07565395534038544, + "kl": 0.26133017241954803, + "learning_rate": 1.998439100098773e-05, + "loss": -0.0152, + "reward": 1.1135416984558106, + "reward_std": 0.16440754048526288, + "rewards/accuracy_reward": 0.1500000035390258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.963541692495346, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 850.1541809082031, + "epoch": 0.11649863978236517, + "grad_norm": 0.10145269334316254, + "kl": 0.15966624915599822, + "learning_rate": 1.998376056815941e-05, + "loss": -0.0171, + "reward": 1.0598958611488343, + "reward_std": 0.1260958842933178, + "rewards/accuracy_reward": 0.0916666692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291805744171, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 796.1166809082031, + "epoch": 0.11681869099055849, + "grad_norm": 0.05918211117386818, + "kl": 0.19708489403128623, + "learning_rate": 1.998311766516706e-05, + "loss": -0.0228, + "reward": 1.0380208671092988, + "reward_std": 0.12481046803295612, + "rewards/accuracy_reward": 0.06458333600312471, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375178813934, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 813.2312683105469, + "epoch": 0.1171387421987518, + "grad_norm": 0.4396161735057831, + "kl": 0.4802224151790142, + "learning_rate": 1.9982462292813693e-05, + "loss": -0.0249, + "reward": 1.076562523841858, + "reward_std": 0.12219937145709991, + "rewards/accuracy_reward": 0.10833333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291865348815, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 806.1104309082032, + "epoch": 0.11745879340694511, + "grad_norm": 0.19936463236808777, + "kl": 0.4187732309103012, + "learning_rate": 1.99817944519179e-05, + "loss": -0.0197, + "reward": 0.9619791805744171, + "reward_std": 0.1182825243100524, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958492279053, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 776.3541809082031, + "epoch": 0.11777884461513842, + "grad_norm": 0.16198308765888214, + "kl": 0.5005293294787407, + "learning_rate": 1.998111414331385e-05, + "loss": 0.0007, + "reward": 1.0546875298023224, + "reward_std": 0.15743937194347382, + "rewards/accuracy_reward": 0.08958333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041746139526, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 801.1104370117188, + "epoch": 0.11809889582333173, + "grad_norm": 0.09354270994663239, + "kl": 0.3035903625190258, + "learning_rate": 1.9980421367851268e-05, + "loss": -0.0126, + "reward": 1.025000023841858, + "reward_std": 0.1401256587356329, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9562500178813934, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 722.7916931152344, + "epoch": 0.11841894703152504, + "grad_norm": 0.08353027701377869, + "kl": 0.282787824422121, + "learning_rate": 1.997971612639547e-05, + "loss": -0.007, + "reward": 1.0192708611488341, + "reward_std": 0.149939689040184, + "rewards/accuracy_reward": 0.05416666921228171, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041805744172, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 701.8291870117188, + "epoch": 0.11873899823971835, + "grad_norm": 0.06042907014489174, + "kl": 0.4572564627975225, + "learning_rate": 1.9978998419827328e-05, + "loss": -0.0004, + "reward": 1.0651041924953462, + "reward_std": 0.12739601023495198, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375178813935, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 751.1271057128906, + "epoch": 0.11905904944791167, + "grad_norm": 0.06057741865515709, + "kl": 0.5198903292417526, + "learning_rate": 1.9978268249043296e-05, + "loss": -0.0026, + "reward": 1.0135417044162751, + "reward_std": 0.17310468517243863, + "rewards/accuracy_reward": 0.05833333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9552083551883698, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 774.7937744140625, + "epoch": 0.11937910065610498, + "grad_norm": 0.043199047446250916, + "kl": 0.21139018721878527, + "learning_rate": 1.9977525614955388e-05, + "loss": -0.0245, + "reward": 1.0348958611488341, + "reward_std": 0.11672738809138536, + "rewards/accuracy_reward": 0.06041666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791865348816, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 740.489599609375, + "epoch": 0.11969915186429829, + "grad_norm": 0.10318101197481155, + "kl": 0.26686358749866484, + "learning_rate": 1.9976770518491184e-05, + "loss": -0.0266, + "reward": 1.0958333611488342, + "reward_std": 0.1272334760054946, + "rewards/accuracy_reward": 0.12916667088866235, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666865348816, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 719.5041870117187, + "epoch": 0.1200192030724916, + "grad_norm": 0.07899950444698334, + "kl": 0.3719005145132542, + "learning_rate": 1.9976002960593833e-05, + "loss": -0.0268, + "reward": 1.0302083611488342, + "reward_std": 0.1475877858698368, + "rewards/accuracy_reward": 0.06250000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083492279053, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 741.1729309082032, + "epoch": 0.12033925428068491, + "grad_norm": 0.04784136265516281, + "kl": 0.19529346860945224, + "learning_rate": 1.9975222942222054e-05, + "loss": -0.0211, + "reward": 1.074479192495346, + "reward_std": 0.10090533643960953, + "rewards/accuracy_reward": 0.0895833358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958432674408, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 741.6521118164062, + "epoch": 0.12065930548887822, + "grad_norm": 0.062281284481287, + "kl": 0.323552380874753, + "learning_rate": 1.9974430464350125e-05, + "loss": -0.0074, + "reward": 0.9817708432674408, + "reward_std": 0.10530988723039628, + "rewards/accuracy_reward": 0.008333333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.973437511920929, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 756.4166809082031, + "epoch": 0.12097935669707154, + "grad_norm": 0.030522586777806282, + "kl": 0.09794650189578533, + "learning_rate": 1.997362552796788e-05, + "loss": -0.0219, + "reward": 1.0385416865348815, + "reward_std": 0.09175706021487713, + "rewards/accuracy_reward": 0.05625000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916805744171, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 735.7312744140625, + "epoch": 0.12129940790526485, + "grad_norm": 0.0645739808678627, + "kl": 0.14556628093123436, + "learning_rate": 1.9972808134080726e-05, + "loss": -0.0531, + "reward": 1.093750035762787, + "reward_std": 0.14756546672433615, + "rewards/accuracy_reward": 0.11875000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000238418579, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 737.6625183105468, + "epoch": 0.12161945911345816, + "grad_norm": 0.06165073439478874, + "kl": 0.16611265018582344, + "learning_rate": 1.9971978283709624e-05, + "loss": 0.0031, + "reward": 1.0317708551883698, + "reward_std": 0.11711971126496792, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041865348816, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 768.0479431152344, + "epoch": 0.12193951032165147, + "grad_norm": 0.08346700668334961, + "kl": 0.1517067790031433, + "learning_rate": 1.9971135977891093e-05, + "loss": 0.0006, + "reward": 1.0239583611488343, + "reward_std": 0.16878514513373374, + "rewards/accuracy_reward": 0.07500000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9489583551883698, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.964599609375, + "epoch": 0.12225956152984478, + "grad_norm": 0.20756888389587402, + "kl": 0.3224208764731884, + "learning_rate": 1.9970281217677207e-05, + "loss": 0.0314, + "reward": 1.0057291865348816, + "reward_std": 0.20953557565808295, + "rewards/accuracy_reward": 0.06875000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9369791865348815, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 760.6562744140625, + "epoch": 0.12257961273803808, + "grad_norm": 0.12297973781824112, + "kl": 0.1835591211915016, + "learning_rate": 1.996941400413561e-05, + "loss": 0.0196, + "reward": 0.9614583671092987, + "reward_std": 0.15597959961742164, + "rewards/accuracy_reward": 0.014583333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9468750298023224, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 731.2791809082031, + "epoch": 0.12289966394623139, + "grad_norm": 0.14354756474494934, + "kl": 0.22697254866361619, + "learning_rate": 1.996853433834948e-05, + "loss": -0.0106, + "reward": 1.0005208551883698, + "reward_std": 0.13219761587679385, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9546875178813934, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 721.2291809082031, + "epoch": 0.1232197151544247, + "grad_norm": 0.3283243179321289, + "kl": 0.7633262783288955, + "learning_rate": 1.996764222141756e-05, + "loss": -0.0299, + "reward": 0.9703125298023224, + "reward_std": 0.25556026250123975, + "rewards/accuracy_reward": 0.05833333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9119791924953461, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 719.9437683105468, + "epoch": 0.12353976636261801, + "grad_norm": 0.8115874528884888, + "kl": 0.4921633303165436, + "learning_rate": 1.9966737654454153e-05, + "loss": -0.0014, + "reward": 0.9791666984558105, + "reward_std": 0.21891369968652724, + "rewards/accuracy_reward": 0.0645833358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9145833671092987, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 712.5854431152344, + "epoch": 0.12385981757081133, + "grad_norm": 0.18551310896873474, + "kl": 0.20581552758812904, + "learning_rate": 1.9965820638589095e-05, + "loss": 0.052, + "reward": 1.1244791924953461, + "reward_std": 0.19005530402064325, + "rewards/accuracy_reward": 0.1687500074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 771.4646057128906, + "epoch": 0.12417986877900464, + "grad_norm": 0.09523138403892517, + "kl": 0.0998759150505066, + "learning_rate": 1.9964891174967786e-05, + "loss": 0.0152, + "reward": 0.9786458432674408, + "reward_std": 0.14558621719479561, + "rewards/accuracy_reward": 0.02708333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.951562511920929, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 762.3541931152344, + "epoch": 0.12449991998719795, + "grad_norm": 0.05193324387073517, + "kl": 0.09262499958276749, + "learning_rate": 1.996394926475116e-05, + "loss": 0.0111, + "reward": 1.0489583432674408, + "reward_std": 0.1252418950200081, + "rewards/accuracy_reward": 0.08125000242143869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083492279053, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 715.6854370117187, + "epoch": 0.12481997119539126, + "grad_norm": 0.06766840070486069, + "kl": 0.10407169163227081, + "learning_rate": 1.996299490911571e-05, + "loss": 0.0009, + "reward": 1.0505208671092987, + "reward_std": 0.15630092974752188, + "rewards/accuracy_reward": 0.0875000013038516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208551883698, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 739.745849609375, + "epoch": 0.12514002240358457, + "grad_norm": 0.06490278989076614, + "kl": 0.22728676721453667, + "learning_rate": 1.9962028109253474e-05, + "loss": 0.0092, + "reward": 0.9703125119209289, + "reward_std": 0.1388203686103225, + "rewards/accuracy_reward": 0.010416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958492279053, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 788.3354370117188, + "epoch": 0.12546007361177788, + "grad_norm": 0.07218482345342636, + "kl": 0.12018184587359429, + "learning_rate": 1.9961048866372016e-05, + "loss": 0.0265, + "reward": 1.0890625238418579, + "reward_std": 0.1456417091190815, + "rewards/accuracy_reward": 0.1250000024214387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9640625178813934, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 780.0896057128906, + "epoch": 0.1257801248199712, + "grad_norm": 0.09532662481069565, + "kl": 0.10603573620319366, + "learning_rate": 1.9960057181694464e-05, + "loss": 0.0204, + "reward": 1.0380208492279053, + "reward_std": 0.10570584982633591, + "rewards/accuracy_reward": 0.07083333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.967187511920929, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 766.1437683105469, + "epoch": 0.1261001760281645, + "grad_norm": 0.07415325194597244, + "kl": 0.16074122115969658, + "learning_rate": 1.9959053056459474e-05, + "loss": 0.0278, + "reward": 1.024479192495346, + "reward_std": 0.13585025519132615, + "rewards/accuracy_reward": 0.06458333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958492279053, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 751.6562805175781, + "epoch": 0.12642022723635782, + "grad_norm": 0.050613872706890106, + "kl": 0.09890075251460076, + "learning_rate": 1.995803649192124e-05, + "loss": 0.0305, + "reward": 1.012500023841858, + "reward_std": 0.16258562356233597, + "rewards/accuracy_reward": 0.05000000093132258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9625000119209289, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 792.3625183105469, + "epoch": 0.12674027844455113, + "grad_norm": 0.07034117728471756, + "kl": 0.1736940063536167, + "learning_rate": 1.9957007489349505e-05, + "loss": 0.0082, + "reward": 1.0932291924953461, + "reward_std": 0.12479073386639357, + "rewards/accuracy_reward": 0.11458333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 803.2937683105469, + "epoch": 0.12706032965274444, + "grad_norm": 0.1689036339521408, + "kl": 0.11238553300499916, + "learning_rate": 1.995596605002953e-05, + "loss": 0.0313, + "reward": 1.078125035762787, + "reward_std": 0.14898527152836322, + "rewards/accuracy_reward": 0.1104166692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083551883697, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 820.8812744140625, + "epoch": 0.12738038086093775, + "grad_norm": 0.1099768802523613, + "kl": 0.09775342904031277, + "learning_rate": 1.9954912175262122e-05, + "loss": 0.0426, + "reward": 1.015625011920929, + "reward_std": 0.13907793909311295, + "rewards/accuracy_reward": 0.054166667722165586, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583432674408, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 788.308349609375, + "epoch": 0.12770043206913106, + "grad_norm": 0.33350008726119995, + "kl": 0.5557160004973412, + "learning_rate": 1.995384586636362e-05, + "loss": -0.0012, + "reward": 1.067187523841858, + "reward_std": 0.12045919597148895, + "rewards/accuracy_reward": 0.10208333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041805744172, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 744.6021057128906, + "epoch": 0.12802048327732438, + "grad_norm": 0.14938224852085114, + "kl": 0.1822477553039789, + "learning_rate": 1.9952767124665892e-05, + "loss": 0.0283, + "reward": 1.0151041924953461, + "reward_std": 0.14971655271947384, + "rewards/accuracy_reward": 0.06250000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9526041865348815, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 792.327099609375, + "epoch": 0.1283405344855177, + "grad_norm": 0.063927561044693, + "kl": 0.1587657429277897, + "learning_rate": 1.995167595151633e-05, + "loss": 0.0413, + "reward": 1.0302083671092988, + "reward_std": 0.21063872575759887, + "rewards/accuracy_reward": 0.08750000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9427083611488343, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 761.5937683105469, + "epoch": 0.128660585693711, + "grad_norm": 0.17094162106513977, + "kl": 0.2179608315229416, + "learning_rate": 1.9950572348277862e-05, + "loss": 0.0222, + "reward": 1.0036458551883698, + "reward_std": 0.17909386456012727, + "rewards/accuracy_reward": 0.07083333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.932812511920929, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 768.9104309082031, + "epoch": 0.1289806369019043, + "grad_norm": 0.09395050257444382, + "kl": 0.18149636760354043, + "learning_rate": 1.9949456316328942e-05, + "loss": 0.0304, + "reward": 1.0598958492279054, + "reward_std": 0.17585733085870742, + "rewards/accuracy_reward": 0.12083333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9390625178813934, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 814.5562622070313, + "epoch": 0.12930068811009762, + "grad_norm": 0.14399601519107819, + "kl": 0.2368574135005474, + "learning_rate": 1.9948327857063536e-05, + "loss": 0.0752, + "reward": 0.9562500298023224, + "reward_std": 0.22825298383831977, + "rewards/accuracy_reward": 0.04375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9125000238418579, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 792.5541870117188, + "epoch": 0.12962073931829093, + "grad_norm": 0.14934572577476501, + "kl": 0.2532314211130142, + "learning_rate": 1.9947186971891143e-05, + "loss": 0.0565, + "reward": 0.9265625298023223, + "reward_std": 0.25942430049180987, + "rewards/accuracy_reward": 0.03333333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8932291865348816, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 788.277099609375, + "epoch": 0.12994079052648425, + "grad_norm": 0.07283183187246323, + "kl": 0.11642909124493599, + "learning_rate": 1.9946033662236778e-05, + "loss": 0.0257, + "reward": 1.043750023841858, + "reward_std": 0.19481766372919082, + "rewards/accuracy_reward": 0.11250000447034836, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.931250023841858, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 776.9812683105469, + "epoch": 0.13026084173467756, + "grad_norm": 0.23960764706134796, + "kl": 0.1925484672188759, + "learning_rate": 1.994486792954098e-05, + "loss": 0.0636, + "reward": 0.9286458611488342, + "reward_std": 0.20316977724432944, + "rewards/accuracy_reward": 0.01041666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9182291865348816, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 784.9062622070312, + "epoch": 0.13058089294287087, + "grad_norm": 0.3904155194759369, + "kl": 0.26361445263028144, + "learning_rate": 1.99436897752598e-05, + "loss": 0.0458, + "reward": 0.986979192495346, + "reward_std": 0.23148050159215927, + "rewards/accuracy_reward": 0.07916666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9078125178813934, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 727.0875244140625, + "epoch": 0.13090094415106418, + "grad_norm": 0.1020391508936882, + "kl": 0.31486469730734823, + "learning_rate": 1.9942499200864805e-05, + "loss": 0.0486, + "reward": 1.0520833492279054, + "reward_std": 0.2606917768716812, + "rewards/accuracy_reward": 0.14791667014360427, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9041666865348816, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 751.4396057128906, + "epoch": 0.1312209953592575, + "grad_norm": 0.11322091519832611, + "kl": 0.16842807456851006, + "learning_rate": 1.994129620784307e-05, + "loss": 0.0705, + "reward": 1.0145833551883698, + "reward_std": 0.18669217731803656, + "rewards/accuracy_reward": 0.08333333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9312500178813934, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 718.8125183105469, + "epoch": 0.1315410465674508, + "grad_norm": 0.14536863565444946, + "kl": 0.3029049329459667, + "learning_rate": 1.9940080797697203e-05, + "loss": 0.0313, + "reward": 1.0515625178813934, + "reward_std": 0.2398408681154251, + "rewards/accuracy_reward": 0.13958334028720856, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9119791865348816, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 765.3958557128906, + "epoch": 0.13186109777564411, + "grad_norm": 0.20916740596294403, + "kl": 0.19605226889252664, + "learning_rate": 1.993885297194529e-05, + "loss": 0.0686, + "reward": 0.9598958611488342, + "reward_std": 0.18760251104831696, + "rewards/accuracy_reward": 0.025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9348958611488343, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 730.7062805175781, + "epoch": 0.13218114898383743, + "grad_norm": 0.38499322533607483, + "kl": 0.2664633825421333, + "learning_rate": 1.9937612732120947e-05, + "loss": 0.0846, + "reward": 1.0156250238418578, + "reward_std": 0.2381811335682869, + "rewards/accuracy_reward": 0.10833333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9072916805744171, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 704.5041931152343, + "epoch": 0.13250120019203074, + "grad_norm": 0.2399313747882843, + "kl": 0.23138594403862953, + "learning_rate": 1.9936360079773287e-05, + "loss": 0.0258, + "reward": 1.0177083551883697, + "reward_std": 0.13459027968347073, + "rewards/accuracy_reward": 0.06875000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9489583611488343, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 679.6562805175781, + "epoch": 0.13282125140022405, + "grad_norm": 0.12286972999572754, + "kl": 0.19738261252641678, + "learning_rate": 1.993509501646693e-05, + "loss": 0.0594, + "reward": 0.9828125357627868, + "reward_std": 0.17468988746404648, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9369791865348815, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 700.7916870117188, + "epoch": 0.13314130260841733, + "grad_norm": 0.09988022595643997, + "kl": 0.1340769723057747, + "learning_rate": 1.9933817543781998e-05, + "loss": 0.0263, + "reward": 1.0812500238418579, + "reward_std": 0.10942931771278382, + "rewards/accuracy_reward": 0.11041667014360428, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 683.0896118164062, + "epoch": 0.13346135381661065, + "grad_norm": 0.1567991077899933, + "kl": 0.16551008746027945, + "learning_rate": 1.9932527663314113e-05, + "loss": 0.0718, + "reward": 1.1494791865348817, + "reward_std": 0.19006953090429307, + "rewards/accuracy_reward": 0.1979166731238365, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9515625178813935, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 681.19169921875, + "epoch": 0.13378140502480396, + "grad_norm": 0.09303200244903564, + "kl": 0.1856502816081047, + "learning_rate": 1.9931225376674388e-05, + "loss": 0.0536, + "reward": 0.9911458492279053, + "reward_std": 0.13494500890374184, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291805744171, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 687.7937683105469, + "epoch": 0.13410145623299727, + "grad_norm": 0.1906178593635559, + "kl": 0.13947633281350136, + "learning_rate": 1.992991068548944e-05, + "loss": 0.0264, + "reward": 1.035937523841858, + "reward_std": 0.13257458936423064, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375238418579, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 660.9916870117188, + "epoch": 0.13442150744119058, + "grad_norm": 0.07890919595956802, + "kl": 0.14936097264289855, + "learning_rate": 1.9928583591401376e-05, + "loss": 0.061, + "reward": 1.021875023841858, + "reward_std": 0.12687795273959637, + "rewards/accuracy_reward": 0.05208333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916746139527, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.2854431152343, + "epoch": 0.1347415586493839, + "grad_norm": 0.1735936552286148, + "kl": 0.1333605393767357, + "learning_rate": 1.99272440960678e-05, + "loss": 0.049, + "reward": 1.0536458551883698, + "reward_std": 0.13581256624311208, + "rewards/accuracy_reward": 0.07291666902601719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291805744172, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 653.3291931152344, + "epoch": 0.1350616098575772, + "grad_norm": 0.07206610590219498, + "kl": 0.12243280112743378, + "learning_rate": 1.9925892201161794e-05, + "loss": 0.0189, + "reward": 1.0270833551883698, + "reward_std": 0.12921709027141332, + "rewards/accuracy_reward": 0.05416666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166805744172, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.7333435058594, + "epoch": 0.13538166106577051, + "grad_norm": 0.08999348431825638, + "kl": 0.15753435716032982, + "learning_rate": 1.9924527908371942e-05, + "loss": 0.0224, + "reward": 1.1036458671092988, + "reward_std": 0.08559925891458989, + "rewards/accuracy_reward": 0.12500000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 695.9896118164063, + "epoch": 0.13570171227396383, + "grad_norm": 0.14725755155086517, + "kl": 0.19224329441785812, + "learning_rate": 1.9923151219402308e-05, + "loss": 0.0054, + "reward": 1.025000023841858, + "reward_std": 0.18301806673407556, + "rewards/accuracy_reward": 0.07083333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9541666865348816, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 676.3000183105469, + "epoch": 0.13602176348215714, + "grad_norm": 0.15762047469615936, + "kl": 0.16184305176138877, + "learning_rate": 1.9921762135972433e-05, + "loss": 0.0042, + "reward": 1.0750000417232513, + "reward_std": 0.11091457530856133, + "rewards/accuracy_reward": 0.10416667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333611488342, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 672.7541900634766, + "epoch": 0.13634181469035045, + "grad_norm": 0.18144048750400543, + "kl": 0.1525034002959728, + "learning_rate": 1.9920360659817345e-05, + "loss": 0.0096, + "reward": 1.0182291865348816, + "reward_std": 0.11338572651147842, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 671.8291870117188, + "epoch": 0.13666186589854376, + "grad_norm": 0.5098335146903992, + "kl": 0.28342188000679014, + "learning_rate": 1.9918946792687553e-05, + "loss": -0.0123, + "reward": 1.0364583671092986, + "reward_std": 0.1355344034731388, + "rewards/accuracy_reward": 0.07500000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583551883698, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 660.2937744140625, + "epoch": 0.13698191710673707, + "grad_norm": 0.6447596549987793, + "kl": 0.19238312169909477, + "learning_rate": 1.9917520536349043e-05, + "loss": -0.0135, + "reward": 1.0156250298023224, + "reward_std": 0.1723696757107973, + "rewards/accuracy_reward": 0.06458333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9510416865348816, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 674.4146057128906, + "epoch": 0.13730196831493038, + "grad_norm": 0.3913459777832031, + "kl": 0.6216646380722523, + "learning_rate": 1.9916081892583264e-05, + "loss": -0.0312, + "reward": 1.0171875298023223, + "reward_std": 0.14305626451969147, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375178813935, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 651.2750183105469, + "epoch": 0.1376220195231237, + "grad_norm": 2.6290104389190674, + "kl": 2.6219520051032306, + "learning_rate": 1.9914630863187156e-05, + "loss": 0.0012, + "reward": 1.2223958551883698, + "reward_std": 0.13749304972589016, + "rewards/accuracy_reward": 0.2604166746139526, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791865348816, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.3666870117188, + "epoch": 0.137942070731317, + "grad_norm": 0.130653515458107, + "kl": 0.22933876588940622, + "learning_rate": 1.991316744997311e-05, + "loss": -0.0182, + "reward": 1.1442708611488341, + "reward_std": 0.2026175085455179, + "rewards/accuracy_reward": 0.18541667368263007, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9588541924953461, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 644.3041931152344, + "epoch": 0.13826212193951032, + "grad_norm": 0.14742442965507507, + "kl": 0.15334233343601228, + "learning_rate": 1.9911691654769004e-05, + "loss": -0.017, + "reward": 1.0322916865348817, + "reward_std": 0.11324199475347996, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583551883698, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 635.5687744140625, + "epoch": 0.13858217314770363, + "grad_norm": 0.10946609079837799, + "kl": 0.2557044789195061, + "learning_rate": 1.991020347941817e-05, + "loss": -0.0395, + "reward": 1.055729192495346, + "reward_std": 0.17008791603147982, + "rewards/accuracy_reward": 0.08958333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458492279053, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.2375244140625, + "epoch": 0.13890222435589694, + "grad_norm": 0.37317514419555664, + "kl": 0.4989757601171732, + "learning_rate": 1.99087029257794e-05, + "loss": -0.073, + "reward": 1.0625000238418578, + "reward_std": 0.1615608898922801, + "rewards/accuracy_reward": 0.11875000596046448, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9437500238418579, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 670.2771118164062, + "epoch": 0.13922227556409025, + "grad_norm": 0.4209003448486328, + "kl": 0.7271633610129357, + "learning_rate": 1.990718999572696e-05, + "loss": -0.0251, + "reward": 1.0343750298023224, + "reward_std": 0.15719158351421356, + "rewards/accuracy_reward": 0.08333333693444729, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9510416805744171, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 663.7791931152344, + "epoch": 0.13954232677228356, + "grad_norm": 0.3114742934703827, + "kl": 0.837194798886776, + "learning_rate": 1.9905664691150567e-05, + "loss": -0.0674, + "reward": 1.0057291865348816, + "reward_std": 0.20192696750164033, + "rewards/accuracy_reward": 0.06041666902601719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9453125178813935, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 706.2041870117188, + "epoch": 0.13986237798047688, + "grad_norm": 0.32630032300949097, + "kl": 0.714079699665308, + "learning_rate": 1.9904127013955385e-05, + "loss": -0.0474, + "reward": 0.9906250298023224, + "reward_std": 0.1500589970499277, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9531250238418579, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 729.1125244140625, + "epoch": 0.1401824291886702, + "grad_norm": 0.21482321619987488, + "kl": 1.2284217976033687, + "learning_rate": 1.990257696606205e-05, + "loss": -0.0535, + "reward": 0.970312523841858, + "reward_std": 0.17871998697519303, + "rewards/accuracy_reward": 0.025, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9453125178813935, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 685.1937683105468, + "epoch": 0.1405024803968635, + "grad_norm": 0.15007822215557098, + "kl": 0.6202835611999035, + "learning_rate": 1.9901014549406647e-05, + "loss": -0.047, + "reward": 1.0390625238418578, + "reward_std": 0.1587829865515232, + "rewards/accuracy_reward": 0.08333333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.955729192495346, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 674.8812683105468, + "epoch": 0.1408225316050568, + "grad_norm": 0.0724816843867302, + "kl": 0.7360513672232628, + "learning_rate": 1.9899439765940687e-05, + "loss": -0.0453, + "reward": 1.043229192495346, + "reward_std": 0.2185194693505764, + "rewards/accuracy_reward": 0.11666667200624943, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.926562511920929, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 664.8854309082031, + "epoch": 0.14114258281325012, + "grad_norm": 0.1124361976981163, + "kl": 0.5966664545238018, + "learning_rate": 1.989785261763116e-05, + "loss": -0.0513, + "reward": 1.0666667103767395, + "reward_std": 0.17953022867441176, + "rewards/accuracy_reward": 0.11041666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9562500238418579, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 738.7437622070313, + "epoch": 0.14146263402144343, + "grad_norm": 0.07581885904073715, + "kl": 0.2596233807504177, + "learning_rate": 1.9896253106460484e-05, + "loss": -0.0163, + "reward": 1.0000000238418578, + "reward_std": 0.1336134120821953, + "rewards/accuracy_reward": 0.047916668094694616, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9520833432674408, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 709.8104370117187, + "epoch": 0.14178268522963675, + "grad_norm": 0.10569097846746445, + "kl": 0.3374425023794174, + "learning_rate": 1.9894641234426512e-05, + "loss": -0.0491, + "reward": 1.0265625357627868, + "reward_std": 0.18511903360486032, + "rewards/accuracy_reward": 0.07083333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 713.5937683105469, + "epoch": 0.14210273643783006, + "grad_norm": 0.06325946003198624, + "kl": 0.23610839396715164, + "learning_rate": 1.989301700354255e-05, + "loss": -0.0304, + "reward": 1.028125023841858, + "reward_std": 0.13945788703858852, + "rewards/accuracy_reward": 0.058333336375653744, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.969791692495346, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 761.3291870117188, + "epoch": 0.14242278764602337, + "grad_norm": 0.06835056841373444, + "kl": 0.48913782387971877, + "learning_rate": 1.9891380415837333e-05, + "loss": -0.049, + "reward": 1.0885416984558105, + "reward_std": 0.13168836012482643, + "rewards/accuracy_reward": 0.12083333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083432674408, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 700.1416809082032, + "epoch": 0.14274283885421668, + "grad_norm": 0.20968970656394958, + "kl": 1.0379090007394551, + "learning_rate": 1.9889731473355037e-05, + "loss": -0.0439, + "reward": 1.088541680574417, + "reward_std": 0.17453248277306557, + "rewards/accuracy_reward": 0.12916667181998492, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9593750178813935, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 709.220849609375, + "epoch": 0.14306289006241, + "grad_norm": 0.10043805092573166, + "kl": 0.9384193673729897, + "learning_rate": 1.9888070178155255e-05, + "loss": -0.0607, + "reward": 0.9505208551883697, + "reward_std": 0.1598832830786705, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9505208551883697, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 738.3750183105469, + "epoch": 0.1433829412706033, + "grad_norm": 0.0979234129190445, + "kl": 0.8160348013043404, + "learning_rate": 1.9886396532313033e-05, + "loss": -0.0677, + "reward": 1.085937511920929, + "reward_std": 0.169689111225307, + "rewards/accuracy_reward": 0.13333333432674407, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9526041805744171, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 747.2396057128906, + "epoch": 0.14370299247879662, + "grad_norm": 0.14574353396892548, + "kl": 0.419120055437088, + "learning_rate": 1.9884710537918817e-05, + "loss": -0.0409, + "reward": 1.0651041865348816, + "reward_std": 0.1468802396208048, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375119209289, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 696.0854400634765, + "epoch": 0.14402304368698993, + "grad_norm": 0.16010096669197083, + "kl": 0.9446754395961762, + "learning_rate": 1.9883012197078497e-05, + "loss": -0.0705, + "reward": 1.1078125298023225, + "reward_std": 0.14860073514282704, + "rewards/accuracy_reward": 0.1479166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958432674408, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 707.6729370117188, + "epoch": 0.14434309489518324, + "grad_norm": 0.1689026951789856, + "kl": 0.75221516340971, + "learning_rate": 1.9881301511913372e-05, + "loss": -0.0726, + "reward": 1.0015625298023223, + "reward_std": 0.20324954241514206, + "rewards/accuracy_reward": 0.06041666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9411458492279052, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 748.652099609375, + "epoch": 0.14466314610337655, + "grad_norm": 0.3587219715118408, + "kl": 0.660324102267623, + "learning_rate": 1.987957848456017e-05, + "loss": -0.0702, + "reward": 1.0984375417232513, + "reward_std": 0.17780651152133942, + "rewards/accuracy_reward": 0.14375000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9546875178813934, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 756.3708557128906, + "epoch": 0.14498319731156986, + "grad_norm": 0.12359297275543213, + "kl": 0.466735539957881, + "learning_rate": 1.9877843117171025e-05, + "loss": -0.0601, + "reward": 0.9791666865348816, + "reward_std": 0.14083536192774773, + "rewards/accuracy_reward": 0.018750001117587088, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9604166805744171, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 719.5229370117188, + "epoch": 0.14530324851976317, + "grad_norm": 0.16601644456386566, + "kl": 0.2682775568217039, + "learning_rate": 1.9876095411913492e-05, + "loss": -0.0461, + "reward": 1.0067708611488342, + "reward_std": 0.14443401992321014, + "rewards/accuracy_reward": 0.04791666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9588541865348816, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 730.7833557128906, + "epoch": 0.14562329972795648, + "grad_norm": 0.08124036341905594, + "kl": 0.20361214652657508, + "learning_rate": 1.9874335370970527e-05, + "loss": -0.0202, + "reward": 1.025000011920929, + "reward_std": 0.10506038665771485, + "rewards/accuracy_reward": 0.04791666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833432674408, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 742.1937774658203, + "epoch": 0.1459433509361498, + "grad_norm": 0.15889286994934082, + "kl": 0.24789012856781484, + "learning_rate": 1.9872562996540506e-05, + "loss": -0.0089, + "reward": 0.957812511920929, + "reward_std": 0.13532231301069259, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291805744171, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 673.864599609375, + "epoch": 0.1462634021443431, + "grad_norm": 0.10304388403892517, + "kl": 0.5019782140851021, + "learning_rate": 1.9870778290837198e-05, + "loss": -0.0221, + "reward": 1.0114583551883698, + "reward_std": 0.18528176210820674, + "rewards/accuracy_reward": 0.05208333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.959375011920929, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 751.6729248046875, + "epoch": 0.14658345335253642, + "grad_norm": 0.46115314960479736, + "kl": 0.4596158303320408, + "learning_rate": 1.986898125608979e-05, + "loss": -0.0049, + "reward": 0.9968750059604645, + "reward_std": 0.159324312210083, + "rewards/accuracy_reward": 0.045833333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9510416746139526, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 728.0625244140625, + "epoch": 0.1469035045607297, + "grad_norm": 0.1962408721446991, + "kl": 1.2458348341286183, + "learning_rate": 1.9867171894542848e-05, + "loss": -0.0097, + "reward": 1.0625000238418578, + "reward_std": 0.20020099878311157, + "rewards/accuracy_reward": 0.12708333786576986, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9354166746139526, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 699.1791931152344, + "epoch": 0.14722355576892301, + "grad_norm": 0.26008763909339905, + "kl": 1.0917382821440698, + "learning_rate": 1.9865350208456354e-05, + "loss": -0.0603, + "reward": 0.9786458551883698, + "reward_std": 0.21206720396876336, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9098958551883698, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.1541870117187, + "epoch": 0.14754360697711633, + "grad_norm": 0.2097797393798828, + "kl": 0.6077730596065521, + "learning_rate": 1.986351620010567e-05, + "loss": -0.0465, + "reward": 0.9921875238418579, + "reward_std": 0.2253425493836403, + "rewards/accuracy_reward": 0.0916666692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9005208373069763, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 664.3604400634765, + "epoch": 0.14786365818530964, + "grad_norm": 0.16634413599967957, + "kl": 0.8130259275436401, + "learning_rate": 1.9861669871781558e-05, + "loss": -0.056, + "reward": 0.9109375178813934, + "reward_std": 0.2595849081873894, + "rewards/accuracy_reward": 0.03750000167638064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.873437511920929, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.364599609375, + "epoch": 0.14818370939350295, + "grad_norm": 0.20586910843849182, + "kl": 0.7062367737293244, + "learning_rate": 1.9859811225790164e-05, + "loss": -0.0838, + "reward": 0.9447916805744171, + "reward_std": 0.25804681032896043, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8760416805744171, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 686.0896057128906, + "epoch": 0.14850376060169626, + "grad_norm": 0.1890844851732254, + "kl": 0.8060359954833984, + "learning_rate": 1.9857940264453015e-05, + "loss": -0.0408, + "reward": 1.0729166984558105, + "reward_std": 0.27789904475212096, + "rewards/accuracy_reward": 0.18958333879709244, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8833333432674408, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 699.0854431152344, + "epoch": 0.14882381180988957, + "grad_norm": 0.1812220811843872, + "kl": 0.8772768050432205, + "learning_rate": 1.9856056990107035e-05, + "loss": -0.0376, + "reward": 0.9104166924953461, + "reward_std": 0.2736320853233337, + "rewards/accuracy_reward": 0.025000000558793544, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.885416692495346, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.462515258789, + "epoch": 0.14914386301808288, + "grad_norm": 0.4718879461288452, + "kl": 0.8429543949663639, + "learning_rate": 1.9854161405104512e-05, + "loss": -0.0093, + "reward": 1.0270833551883698, + "reward_std": 0.2788348212838173, + "rewards/accuracy_reward": 0.12083333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9062500119209289, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 671.1646057128906, + "epoch": 0.1494639142262762, + "grad_norm": 0.12379126250743866, + "kl": 0.4919497549533844, + "learning_rate": 1.9852253511813117e-05, + "loss": -0.026, + "reward": 0.9906250178813935, + "reward_std": 0.17988998740911483, + "rewards/accuracy_reward": 0.037500000186264516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9531250178813935, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 699.8041931152344, + "epoch": 0.1497839654344695, + "grad_norm": 0.0915331318974495, + "kl": 0.4208625890314579, + "learning_rate": 1.9850333312615895e-05, + "loss": 0.0, + "reward": 0.9916666865348815, + "reward_std": 0.13873637914657594, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9520833432674408, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 650.2416931152344, + "epoch": 0.15010401664266282, + "grad_norm": 0.06342560797929764, + "kl": 0.2490247033536434, + "learning_rate": 1.9848400809911255e-05, + "loss": -0.0083, + "reward": 1.0442708551883697, + "reward_std": 0.13400398399680852, + "rewards/accuracy_reward": 0.07291667014360428, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541746139527, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 678.0625183105469, + "epoch": 0.15042406785085613, + "grad_norm": 0.08342447876930237, + "kl": 0.3054353781044483, + "learning_rate": 1.9846456006112993e-05, + "loss": 0.0171, + "reward": 1.0041666865348815, + "reward_std": 0.11574141420423985, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666746139526, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 665.9395935058594, + "epoch": 0.15074411905904944, + "grad_norm": 0.05020037665963173, + "kl": 0.21988069340586663, + "learning_rate": 1.9844498903650246e-05, + "loss": -0.0038, + "reward": 1.044791692495346, + "reward_std": 0.1145276602357626, + "rewards/accuracy_reward": 0.07291666921228171, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750178813934, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.4000183105469, + "epoch": 0.15106417026724275, + "grad_norm": 0.07973285764455795, + "kl": 0.13310433998703958, + "learning_rate": 1.9842529504967522e-05, + "loss": -0.0014, + "reward": 1.0661458551883698, + "reward_std": 0.08744059428572655, + "rewards/accuracy_reward": 0.08125000223517417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958492279053, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.7521057128906, + "epoch": 0.15138422147543606, + "grad_norm": 0.07526786625385284, + "kl": 0.1988142393529415, + "learning_rate": 1.9840547812524692e-05, + "loss": 0.0236, + "reward": 1.0739583671092987, + "reward_std": 0.10084521546959876, + "rewards/accuracy_reward": 0.0958333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250178813934, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 642.5104370117188, + "epoch": 0.15170427268362938, + "grad_norm": 0.10696788132190704, + "kl": 0.20733307376503946, + "learning_rate": 1.9838553828796977e-05, + "loss": 0.0667, + "reward": 1.1098958671092987, + "reward_std": 0.1374841509386897, + "rewards/accuracy_reward": 0.13750000540167093, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958432674408, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 691.658349609375, + "epoch": 0.1520243238918227, + "grad_norm": 0.08135160803794861, + "kl": 0.13316810727119446, + "learning_rate": 1.9836547556274954e-05, + "loss": 0.0229, + "reward": 1.1130208492279052, + "reward_std": 0.07824347745627165, + "rewards/accuracy_reward": 0.1270833395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375059604645, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 714.3812805175781, + "epoch": 0.152344375100016, + "grad_norm": 0.04559599235653877, + "kl": 0.12910185605287552, + "learning_rate": 1.9834528997464543e-05, + "loss": 0.0007, + "reward": 0.9895833492279053, + "reward_std": 0.05833333432674408, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.987500011920929, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 711.7854309082031, + "epoch": 0.1526644263082093, + "grad_norm": 0.04492470622062683, + "kl": 0.42608394622802737, + "learning_rate": 1.983249815488702e-05, + "loss": 0.0006, + "reward": 1.1776041865348816, + "reward_std": 0.08468389138579369, + "rewards/accuracy_reward": 0.1895833395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9880208432674408, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 693.3146179199218, + "epoch": 0.15298447751640262, + "grad_norm": 0.11721441149711609, + "kl": 0.16851173639297484, + "learning_rate": 1.9830455031078994e-05, + "loss": 0.0135, + "reward": 1.0994791984558105, + "reward_std": 0.09610393829643726, + "rewards/accuracy_reward": 0.11875000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291865348816, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 718.1646057128906, + "epoch": 0.15330452872459593, + "grad_norm": 0.06621968746185303, + "kl": 0.16904096342623234, + "learning_rate": 1.9828399628592415e-05, + "loss": 0.019, + "reward": 1.0234375178813935, + "reward_std": 0.08933095633983612, + "rewards/accuracy_reward": 0.041666668839752674, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708373069763, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 676.7208557128906, + "epoch": 0.15362457993278925, + "grad_norm": 0.04756326973438263, + "kl": 0.1703629747033119, + "learning_rate": 1.982633194999458e-05, + "loss": 0.0217, + "reward": 1.0296875178813933, + "reward_std": 0.12128421142697335, + "rewards/accuracy_reward": 0.04583333414047956, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541805744171, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 705.4875244140625, + "epoch": 0.15394463114098256, + "grad_norm": 0.05060265213251114, + "kl": 0.14812782481312753, + "learning_rate": 1.982425199786811e-05, + "loss": 0.0175, + "reward": 0.9734375178813934, + "reward_std": 0.0796633617952466, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375178813934, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 673.8708526611329, + "epoch": 0.15426468234917587, + "grad_norm": 0.05663428083062172, + "kl": 0.11479860544204712, + "learning_rate": 1.982215977481096e-05, + "loss": 0.0233, + "reward": 1.0630208492279052, + "reward_std": 0.1249284602701664, + "rewards/accuracy_reward": 0.08541666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041686534882, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 654.2604431152344, + "epoch": 0.15458473355736918, + "grad_norm": 0.04875878244638443, + "kl": 0.15140649005770684, + "learning_rate": 1.9820055283436405e-05, + "loss": 0.0177, + "reward": 1.0848958671092988, + "reward_std": 0.13879981879144906, + "rewards/accuracy_reward": 0.10833333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625178813935, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 684.2354248046875, + "epoch": 0.1549047847655625, + "grad_norm": 0.19684112071990967, + "kl": 0.12192679718136787, + "learning_rate": 1.981793852637305e-05, + "loss": 0.012, + "reward": 1.0083333671092987, + "reward_std": 0.13445462454110385, + "rewards/accuracy_reward": 0.033333333767950536, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000178813935, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 677.3979370117188, + "epoch": 0.1552248359737558, + "grad_norm": 0.0429503507912159, + "kl": 0.07962028235197068, + "learning_rate": 1.9815809506264822e-05, + "loss": 0.0169, + "reward": 1.1307291984558105, + "reward_std": 0.10726220346987247, + "rewards/accuracy_reward": 0.15000000800937413, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291805744172, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 700.1291931152343, + "epoch": 0.15554488718194912, + "grad_norm": 0.1666691154241562, + "kl": 0.3849435657262802, + "learning_rate": 1.9813668225770963e-05, + "loss": 0.0189, + "reward": 0.9781250059604645, + "reward_std": 0.14107858017086983, + "rewards/accuracy_reward": 0.018750000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.959375011920929, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.3687744140625, + "epoch": 0.15586493839014243, + "grad_norm": 0.06179690733551979, + "kl": 0.11919120997190476, + "learning_rate": 1.981151468756603e-05, + "loss": 0.0271, + "reward": 1.0656250357627868, + "reward_std": 0.07652283795177936, + "rewards/accuracy_reward": 0.07916666902601718, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583492279053, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 695.7000183105469, + "epoch": 0.15618498959833574, + "grad_norm": 0.10747893154621124, + "kl": 0.18021718934178352, + "learning_rate": 1.9809348894339878e-05, + "loss": 0.0037, + "reward": 1.0776042103767396, + "reward_std": 0.13260272592306138, + "rewards/accuracy_reward": 0.10208333786576987, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 677.889599609375, + "epoch": 0.15650504080652905, + "grad_norm": 0.05964363366365433, + "kl": 0.2115800127387047, + "learning_rate": 1.9807170848797693e-05, + "loss": 0.0441, + "reward": 1.0671875417232513, + "reward_std": 0.14080357179045677, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875178813935, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 729.8416870117187, + "epoch": 0.15682509201472236, + "grad_norm": 0.05163416638970375, + "kl": 0.11210698634386063, + "learning_rate": 1.980498055365994e-05, + "loss": 0.0233, + "reward": 1.0614583611488342, + "reward_std": 0.11770116221159696, + "rewards/accuracy_reward": 0.08750000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583551883697, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 724.8958618164063, + "epoch": 0.15714514322291567, + "grad_norm": 0.06177474930882454, + "kl": 0.19536799862980841, + "learning_rate": 1.9802778011662406e-05, + "loss": 0.0382, + "reward": 1.037500023841858, + "reward_std": 0.1432920940220356, + "rewards/accuracy_reward": 0.06875000111758708, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500178813935, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 728.8104431152344, + "epoch": 0.15746519443110898, + "grad_norm": 0.07783558964729309, + "kl": 0.4894620396196842, + "learning_rate": 1.980056322555616e-05, + "loss": 0.0228, + "reward": 0.9973958551883697, + "reward_std": 0.16165089309215547, + "rewards/accuracy_reward": 0.043750002048909664, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9536458611488342, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 691.7229370117187, + "epoch": 0.1577852456393023, + "grad_norm": 0.04840158671140671, + "kl": 0.1310286693274975, + "learning_rate": 1.9798336198107567e-05, + "loss": 0.0466, + "reward": 1.143229216337204, + "reward_std": 0.16915795914828777, + "rewards/accuracy_reward": 0.1791666742414236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9640625178813934, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 710.3750183105469, + "epoch": 0.1581052968474956, + "grad_norm": 0.07563085108995438, + "kl": 0.16674732267856598, + "learning_rate": 1.979609693209829e-05, + "loss": 0.0548, + "reward": 1.014062523841858, + "reward_std": 0.19497016742825507, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9494791805744172, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 643.0791870117188, + "epoch": 0.15842534805568892, + "grad_norm": 0.07467242330312729, + "kl": 0.22235333174467087, + "learning_rate": 1.9793845430325263e-05, + "loss": 0.0294, + "reward": 1.0197916865348815, + "reward_std": 0.16965479105710984, + "rewards/accuracy_reward": 0.04791666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750178813934, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 677.2750244140625, + "epoch": 0.15874539926388223, + "grad_norm": 0.06882388889789581, + "kl": 0.171281161904335, + "learning_rate": 1.9791581695600722e-05, + "loss": 0.0324, + "reward": 1.0578125298023224, + "reward_std": 0.15982217490673065, + "rewards/accuracy_reward": 0.08750000353902579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125119209289, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.9937683105469, + "epoch": 0.15906545047207554, + "grad_norm": 0.11645076423883438, + "kl": 0.20202036798000336, + "learning_rate": 1.9789305730752167e-05, + "loss": 0.0403, + "reward": 1.0416666924953462, + "reward_std": 0.13704411685466766, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833432674408, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 668.1021057128906, + "epoch": 0.15938550168026885, + "grad_norm": 0.06207313388586044, + "kl": 0.2349511541426182, + "learning_rate": 1.978701753862238e-05, + "loss": 0.0339, + "reward": 1.0593750298023223, + "reward_std": 0.1413394134491682, + "rewards/accuracy_reward": 0.0916666692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083551883697, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 714.4166870117188, + "epoch": 0.15970555288846217, + "grad_norm": 0.058839015662670135, + "kl": 0.28067911267280576, + "learning_rate": 1.9784717122069425e-05, + "loss": 0.025, + "reward": 1.0302083551883698, + "reward_std": 0.17109771873801946, + "rewards/accuracy_reward": 0.06458333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.965625011920929, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 648.9312744140625, + "epoch": 0.16002560409665548, + "grad_norm": 0.1544368416070938, + "kl": 0.24343355521559715, + "learning_rate": 1.978240448396661e-05, + "loss": 0.042, + "reward": 1.090625023841858, + "reward_std": 0.16572601571679116, + "rewards/accuracy_reward": 0.12708333656191825, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 688.120849609375, + "epoch": 0.1603456553048488, + "grad_norm": 0.0498308427631855, + "kl": 0.10392797477543354, + "learning_rate": 1.9780079627202534e-05, + "loss": 0.0175, + "reward": 1.0885416924953462, + "reward_std": 0.09727377630770206, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 501 + }, + { + "clip_ratio": 0.0, + "completion_length": 658.6937622070312, + "epoch": 0.1606657065130421, + "grad_norm": 0.1539488285779953, + "kl": 0.19753100723028183, + "learning_rate": 1.9777742554681044e-05, + "loss": 0.0408, + "reward": 1.0635416984558106, + "reward_std": 0.15354348700493575, + "rewards/accuracy_reward": 0.09166666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.971875011920929, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completion_length": 661.1750183105469, + "epoch": 0.16098575772123538, + "grad_norm": 0.09290697425603867, + "kl": 0.33730019479990003, + "learning_rate": 1.9775393269321252e-05, + "loss": 0.0218, + "reward": 1.115104192495346, + "reward_std": 0.12255375757813454, + "rewards/accuracy_reward": 0.13750000447034835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completion_length": 671.9041809082031, + "epoch": 0.1613058089294287, + "grad_norm": 0.2346062958240509, + "kl": 0.2508824057877064, + "learning_rate": 1.9773031774057515e-05, + "loss": 0.0345, + "reward": 1.0421875298023224, + "reward_std": 0.2036103442311287, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208492279053, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.0000213623047, + "epoch": 0.161625860137622, + "grad_norm": 0.11217735707759857, + "kl": 0.1789027236402035, + "learning_rate": 1.9770658071839448e-05, + "loss": 0.0414, + "reward": 1.0046875119209289, + "reward_std": 0.1331900667399168, + "rewards/accuracy_reward": 0.02916666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208373069764, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.3062683105469, + "epoch": 0.16194591134581532, + "grad_norm": 0.062012333422899246, + "kl": 0.26196385324001314, + "learning_rate": 1.976827216563191e-05, + "loss": 0.0301, + "reward": 1.0640625178813934, + "reward_std": 0.12917129397392274, + "rewards/accuracy_reward": 0.08750000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625059604645, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completion_length": 683.8312622070313, + "epoch": 0.16226596255400863, + "grad_norm": 0.06038212776184082, + "kl": 0.14489686340093613, + "learning_rate": 1.9765874058415013e-05, + "loss": 0.0513, + "reward": 1.046875011920929, + "reward_std": 0.10234173312783242, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completion_length": 635.5646118164062, + "epoch": 0.16258601376220194, + "grad_norm": 0.08867309987545013, + "kl": 0.30611826479434967, + "learning_rate": 1.9763463753184092e-05, + "loss": 0.0301, + "reward": 1.055729192495346, + "reward_std": 0.11597478222101927, + "rewards/accuracy_reward": 0.08958333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completion_length": 662.7104339599609, + "epoch": 0.16290606497039525, + "grad_norm": 0.09610988944768906, + "kl": 0.25172789543867113, + "learning_rate": 1.9761041252949725e-05, + "loss": 0.0491, + "reward": 1.045312511920929, + "reward_std": 0.13255398720502853, + "rewards/accuracy_reward": 0.08333333432674409, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791746139527, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completion_length": 671.8375183105469, + "epoch": 0.16322611617858857, + "grad_norm": 0.07011920213699341, + "kl": 0.2210270531475544, + "learning_rate": 1.975860656073773e-05, + "loss": 0.0615, + "reward": 1.0250000178813934, + "reward_std": 0.1721810780465603, + "rewards/accuracy_reward": 0.06666666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 674.189599609375, + "epoch": 0.16354616738678188, + "grad_norm": 0.08514299988746643, + "kl": 0.3713012598454952, + "learning_rate": 1.9756159679589143e-05, + "loss": 0.0475, + "reward": 0.9869791805744171, + "reward_std": 0.1320252813398838, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9536458432674408, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completion_length": 674.545849609375, + "epoch": 0.1638662185949752, + "grad_norm": 0.14546971023082733, + "kl": 0.31686792969703675, + "learning_rate": 1.9753700612560228e-05, + "loss": 0.0723, + "reward": 1.0312500298023224, + "reward_std": 0.19421770237386227, + "rewards/accuracy_reward": 0.08750000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.943750011920929, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completion_length": 647.0000183105469, + "epoch": 0.1641862698031685, + "grad_norm": 0.11060936003923416, + "kl": 0.23016732260584832, + "learning_rate": 1.9751229362722467e-05, + "loss": 0.0717, + "reward": 1.0494791865348816, + "reward_std": 0.12477188017219305, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291805744171, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completion_length": 647.9833557128907, + "epoch": 0.1645063210113618, + "grad_norm": 0.1363779753446579, + "kl": 0.27407467886805537, + "learning_rate": 1.974874593316257e-05, + "loss": 0.0754, + "reward": 1.0812500238418579, + "reward_std": 0.17592179030179977, + "rewards/accuracy_reward": 0.13333333730697633, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9479166746139527, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completion_length": 663.7750244140625, + "epoch": 0.16482637221955512, + "grad_norm": 0.23831957578659058, + "kl": 0.25771130472421644, + "learning_rate": 1.9746250326982444e-05, + "loss": 0.0599, + "reward": 1.0421875298023224, + "reward_std": 0.16807909309864044, + "rewards/accuracy_reward": 0.09375000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9484375178813934, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 642.4479370117188, + "epoch": 0.16514642342774843, + "grad_norm": 0.1442326009273529, + "kl": 0.2700896874070168, + "learning_rate": 1.9743742547299213e-05, + "loss": 0.0615, + "reward": 1.0427083492279052, + "reward_std": 0.15232707187533379, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.959375011920929, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completion_length": 650.9854370117188, + "epoch": 0.16546647463594175, + "grad_norm": 0.6016630530357361, + "kl": 0.3060616210103035, + "learning_rate": 1.974122259724521e-05, + "loss": 0.0772, + "reward": 1.001562523841858, + "reward_std": 0.17864162977784873, + "rewards/accuracy_reward": 0.04375000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9578125059604645, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completion_length": 667.845849609375, + "epoch": 0.16578652584413506, + "grad_norm": 0.15949472784996033, + "kl": 0.39566795006394384, + "learning_rate": 1.9738690479967964e-05, + "loss": 0.0775, + "reward": 1.064062523841858, + "reward_std": 0.160395810008049, + "rewards/accuracy_reward": 0.11250000353902578, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.951562511920929, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completion_length": 701.4521057128907, + "epoch": 0.16610657705232837, + "grad_norm": 0.32095056772232056, + "kl": 0.6456972368061542, + "learning_rate": 1.9736146198630207e-05, + "loss": 0.0871, + "reward": 0.9614583492279053, + "reward_std": 0.23185053169727327, + "rewards/accuracy_reward": 0.05208333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.909375011920929, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completion_length": 693.208349609375, + "epoch": 0.16642662826052168, + "grad_norm": 0.18055865168571472, + "kl": 0.4101051360368729, + "learning_rate": 1.973358975640985e-05, + "loss": 0.0719, + "reward": 0.979687511920929, + "reward_std": 0.2121795818209648, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9296875119209289, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 684.1354309082031, + "epoch": 0.166746679468715, + "grad_norm": 0.23663505911827087, + "kl": 0.4086972147226334, + "learning_rate": 1.9731021156500015e-05, + "loss": 0.125, + "reward": 0.9437500238418579, + "reward_std": 0.2564606711268425, + "rewards/accuracy_reward": 0.031250000558793546, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9125000178813935, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completion_length": 684.6520935058594, + "epoch": 0.1670667306769083, + "grad_norm": 0.294890433549881, + "kl": 0.3842778980731964, + "learning_rate": 1.972844040210899e-05, + "loss": 0.1195, + "reward": 0.8927083373069763, + "reward_std": 0.26684831380844115, + "rewards/accuracy_reward": 0.01041666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8822916746139526, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completion_length": 721.1958557128906, + "epoch": 0.16738678188510162, + "grad_norm": 0.36491721868515015, + "kl": 0.6396187901496887, + "learning_rate": 1.9725847496460256e-05, + "loss": 0.1289, + "reward": 0.8583333551883697, + "reward_std": 0.29743548184633256, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8187500178813935, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completion_length": 710.1604370117187, + "epoch": 0.16770683309329493, + "grad_norm": 0.4135096073150635, + "kl": 0.605698075890541, + "learning_rate": 1.9723242442792473e-05, + "loss": 0.1082, + "reward": 0.8958333551883697, + "reward_std": 0.3122894302010536, + "rewards/accuracy_reward": 0.07083333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8250000178813934, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completion_length": 688.0000183105469, + "epoch": 0.16802688430148824, + "grad_norm": 0.39389896392822266, + "kl": 0.43594875633716584, + "learning_rate": 1.972062524435946e-05, + "loss": 0.1242, + "reward": 0.9848958611488342, + "reward_std": 0.28320216238498686, + "rewards/accuracy_reward": 0.11250000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8723958492279053, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 669.5000244140625, + "epoch": 0.16834693550968155, + "grad_norm": 0.27188587188720703, + "kl": 0.47881949692964554, + "learning_rate": 1.9717995904430224e-05, + "loss": 0.1263, + "reward": 0.9119791865348816, + "reward_std": 0.22102243602275848, + "rewards/accuracy_reward": 0.00625, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9057291865348815, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.5646057128906, + "epoch": 0.16866698671787486, + "grad_norm": 0.19776694476604462, + "kl": 0.42077905088663103, + "learning_rate": 1.9715354426288923e-05, + "loss": 0.1345, + "reward": 0.993229192495346, + "reward_std": 0.22253528684377671, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9140625178813935, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completion_length": 652.664599609375, + "epoch": 0.16898703792606817, + "grad_norm": 0.2069554179906845, + "kl": 0.3527437448501587, + "learning_rate": 1.971270081323488e-05, + "loss": 0.0745, + "reward": 1.058854204416275, + "reward_std": 0.2211466073989868, + "rewards/accuracy_reward": 0.11458333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9442708432674408, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completion_length": 666.9166870117188, + "epoch": 0.16930708913426148, + "grad_norm": 0.14599734544754028, + "kl": 0.2297988161444664, + "learning_rate": 1.9710035068582586e-05, + "loss": 0.0446, + "reward": 1.0453125357627868, + "reward_std": 0.18706899434328078, + "rewards/accuracy_reward": 0.07708333600312471, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291805744171, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completion_length": 712.2750244140625, + "epoch": 0.1696271403424548, + "grad_norm": 0.1098317950963974, + "kl": 0.262774883210659, + "learning_rate": 1.9707357195661663e-05, + "loss": 0.0559, + "reward": 1.0963541865348816, + "reward_std": 0.1620855674147606, + "rewards/accuracy_reward": 0.13125000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041805744172, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.2083557128906, + "epoch": 0.1699471915506481, + "grad_norm": 0.11071398854255676, + "kl": 0.1833828866481781, + "learning_rate": 1.9704667197816906e-05, + "loss": 0.0336, + "reward": 1.0375000357627868, + "reward_std": 0.15114332139492034, + "rewards/accuracy_reward": 0.06041666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833432674408, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completion_length": 655.6458465576172, + "epoch": 0.17026724275884142, + "grad_norm": 0.04358408972620964, + "kl": 0.14082614332437515, + "learning_rate": 1.970196507840823e-05, + "loss": 0.0266, + "reward": 1.1223958730697632, + "reward_std": 0.11901317611336708, + "rewards/accuracy_reward": 0.1395833373069763, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9828125178813935, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completion_length": 667.9833618164063, + "epoch": 0.17058729396703473, + "grad_norm": 0.07485391199588776, + "kl": 0.16539901047945021, + "learning_rate": 1.9699250840810714e-05, + "loss": 0.0206, + "reward": 0.9927083492279053, + "reward_std": 0.09257282391190529, + "rewards/accuracy_reward": 0.014583333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completion_length": 657.2625183105469, + "epoch": 0.17090734517522804, + "grad_norm": 0.07787440717220306, + "kl": 0.14856073185801505, + "learning_rate": 1.969652448841456e-05, + "loss": 0.0205, + "reward": 1.1171875238418578, + "reward_std": 0.09560411293059587, + "rewards/accuracy_reward": 0.1312500059604645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375178813934, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completion_length": 682.5000305175781, + "epoch": 0.17122739638342135, + "grad_norm": 0.05641574785113335, + "kl": 0.16644731312990188, + "learning_rate": 1.9693786024625097e-05, + "loss": 0.0178, + "reward": 1.0593750178813934, + "reward_std": 0.10782541166990996, + "rewards/accuracy_reward": 0.0770833358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916805744171, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 659.7396057128906, + "epoch": 0.17154744759161467, + "grad_norm": 0.1265953779220581, + "kl": 0.16372175514698029, + "learning_rate": 1.9691035452862798e-05, + "loss": 0.028, + "reward": 1.0588541984558106, + "reward_std": 0.13097876124083996, + "rewards/accuracy_reward": 0.0812500050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.977604192495346, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completion_length": 698.2125183105469, + "epoch": 0.17186749879980798, + "grad_norm": 0.07919764518737793, + "kl": 0.15723595917224883, + "learning_rate": 1.9688272776563248e-05, + "loss": 0.0262, + "reward": 1.0427083611488341, + "reward_std": 0.12501449398696424, + "rewards/accuracy_reward": 0.05833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completion_length": 662.214599609375, + "epoch": 0.1721875500080013, + "grad_norm": 0.038847651332616806, + "kl": 0.128957362473011, + "learning_rate": 1.968549799917715e-05, + "loss": 0.0213, + "reward": 1.0822916984558106, + "reward_std": 0.11970577985048295, + "rewards/accuracy_reward": 0.10000000465661288, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916805744171, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completion_length": 653.0750244140625, + "epoch": 0.1725076012161946, + "grad_norm": 0.04241522029042244, + "kl": 0.10394577831029891, + "learning_rate": 1.9682711124170325e-05, + "loss": 0.0061, + "reward": 1.0479166805744171, + "reward_std": 0.10726981312036514, + "rewards/accuracy_reward": 0.060416667722165585, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9875000059604645, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completion_length": 674.6229370117187, + "epoch": 0.1728276524243879, + "grad_norm": 0.03727027401328087, + "kl": 0.11779571026563644, + "learning_rate": 1.9679912155023713e-05, + "loss": 0.0261, + "reward": 1.0494791984558105, + "reward_std": 0.1118181511759758, + "rewards/accuracy_reward": 0.06250000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9869791746139527, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 710.9666870117187, + "epoch": 0.17314770363258122, + "grad_norm": 0.05394143611192703, + "kl": 0.12853079214692115, + "learning_rate": 1.9677101095233342e-05, + "loss": 0.0488, + "reward": 1.0333333671092988, + "reward_std": 0.17711131498217583, + "rewards/accuracy_reward": 0.06250000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completion_length": 680.5291809082031, + "epoch": 0.17346775484077454, + "grad_norm": 0.05786094814538956, + "kl": 0.12042107433080673, + "learning_rate": 1.9674277948310355e-05, + "loss": 0.0222, + "reward": 1.0182291924953462, + "reward_std": 0.10973568204790354, + "rewards/accuracy_reward": 0.04166666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625059604645, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completion_length": 713.570849609375, + "epoch": 0.17378780604896785, + "grad_norm": 0.09852173924446106, + "kl": 0.108553709089756, + "learning_rate": 1.9671442717780992e-05, + "loss": 0.0338, + "reward": 1.0500000357627868, + "reward_std": 0.0853736650198698, + "rewards/accuracy_reward": 0.06875000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9812500178813934, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completion_length": 759.470849609375, + "epoch": 0.17410785725716116, + "grad_norm": 0.045692119747400284, + "kl": 0.21100176870822906, + "learning_rate": 1.966859540718658e-05, + "loss": 0.0341, + "reward": 1.0578125298023224, + "reward_std": 0.12945474069565535, + "rewards/accuracy_reward": 0.08541666902601719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958432674408, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completion_length": 707.0541870117188, + "epoch": 0.17442790846535447, + "grad_norm": 0.2436332404613495, + "kl": 0.1932619445025921, + "learning_rate": 1.9665736020083533e-05, + "loss": 0.0672, + "reward": 1.1473958611488342, + "reward_std": 0.14116937890648842, + "rewards/accuracy_reward": 0.18125000596046448, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 678.1062683105469, + "epoch": 0.17474795967354778, + "grad_norm": 0.08150272816419601, + "kl": 0.1950148455798626, + "learning_rate": 1.9662864560043364e-05, + "loss": 0.066, + "reward": 0.9979166865348816, + "reward_std": 0.12375719584524632, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9604166865348815, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completion_length": 715.6021118164062, + "epoch": 0.17506801088174107, + "grad_norm": 0.06785853207111359, + "kl": 0.1598280780017376, + "learning_rate": 1.9659981030652648e-05, + "loss": 0.0494, + "reward": 0.9994791865348815, + "reward_std": 0.1482737548649311, + "rewards/accuracy_reward": 0.02708333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958492279052, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completion_length": 705.9791931152344, + "epoch": 0.17538806208993438, + "grad_norm": 0.0760614350438118, + "kl": 0.15669554099440575, + "learning_rate": 1.9657085435513043e-05, + "loss": 0.0302, + "reward": 0.987500011920929, + "reward_std": 0.09059235211461783, + "rewards/accuracy_reward": 0.008333333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666746139527, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completion_length": 646.0375183105468, + "epoch": 0.1757081132981277, + "grad_norm": 0.055525023490190506, + "kl": 0.15209799632430077, + "learning_rate": 1.9654177778241278e-05, + "loss": 0.0474, + "reward": 1.1031250298023223, + "reward_std": 0.16906238086521624, + "rewards/accuracy_reward": 0.1333333384245634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916865348816, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completion_length": 682.064599609375, + "epoch": 0.176028164506321, + "grad_norm": 0.11843759566545486, + "kl": 0.14061254411935806, + "learning_rate": 1.965125806246915e-05, + "loss": 0.0696, + "reward": 1.0270833551883698, + "reward_std": 0.1306549172848463, + "rewards/accuracy_reward": 0.054166667722165586, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166805744172, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 685.5396118164062, + "epoch": 0.1763482157145143, + "grad_norm": 0.0710177794098854, + "kl": 0.1787277102470398, + "learning_rate": 1.9648326291843505e-05, + "loss": 0.0245, + "reward": 1.0244791865348817, + "reward_std": 0.08614383526146412, + "rewards/accuracy_reward": 0.04375000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291865348816, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completion_length": 662.0729309082031, + "epoch": 0.17666826692270762, + "grad_norm": 0.18198657035827637, + "kl": 0.2556367240846157, + "learning_rate": 1.9645382470026267e-05, + "loss": 0.0487, + "reward": 1.0348958432674409, + "reward_std": 0.16033005770295858, + "rewards/accuracy_reward": 0.075, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958432674408, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 681.495849609375, + "epoch": 0.17698831813090093, + "grad_norm": 0.04028153792023659, + "kl": 0.10996652320027352, + "learning_rate": 1.9642426600694395e-05, + "loss": 0.0267, + "reward": 1.0442708551883697, + "reward_std": 0.13642770163714885, + "rewards/accuracy_reward": 0.07083333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.973437511920929, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completion_length": 731.7521057128906, + "epoch": 0.17730836933909425, + "grad_norm": 0.06031052768230438, + "kl": 0.13037733137607574, + "learning_rate": 1.9639458687539905e-05, + "loss": 0.0409, + "reward": 1.0692708611488342, + "reward_std": 0.1615061044692993, + "rewards/accuracy_reward": 0.11250000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9567708492279052, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completion_length": 666.8333557128906, + "epoch": 0.17762842054728756, + "grad_norm": 0.08267563581466675, + "kl": 0.16332112476229668, + "learning_rate": 1.9636478734269854e-05, + "loss": 0.0564, + "reward": 0.9770833432674408, + "reward_std": 0.11380629241466522, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.0062683105468, + "epoch": 0.17794847175548087, + "grad_norm": 0.13827405869960785, + "kl": 0.19004355743527412, + "learning_rate": 1.963348674460633e-05, + "loss": 0.0621, + "reward": 1.0869791865348817, + "reward_std": 0.15473262146115302, + "rewards/accuracy_reward": 0.1208333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completion_length": 709.895849609375, + "epoch": 0.17826852296367418, + "grad_norm": 0.08501515537500381, + "kl": 0.14372501894831657, + "learning_rate": 1.9630482722286473e-05, + "loss": 0.0429, + "reward": 1.0583333671092987, + "reward_std": 0.13740524798631668, + "rewards/accuracy_reward": 0.08541666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166746139526, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completion_length": 679.3437683105469, + "epoch": 0.1785885741718675, + "grad_norm": 0.3827410936355591, + "kl": 0.22375328987836837, + "learning_rate": 1.9627466671062434e-05, + "loss": 0.0238, + "reward": 1.1010416984558105, + "reward_std": 0.11596148405224085, + "rewards/accuracy_reward": 0.12916667032986878, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750178813934, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completion_length": 637.4833557128907, + "epoch": 0.1789086253800608, + "grad_norm": 0.07304264605045319, + "kl": 0.41341082081198693, + "learning_rate": 1.9624438594701397e-05, + "loss": 0.042, + "reward": 1.0317708432674408, + "reward_std": 0.16881751529872419, + "rewards/accuracy_reward": 0.06250000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708492279053, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completion_length": 664.8666931152344, + "epoch": 0.17922867658825412, + "grad_norm": 0.07624661922454834, + "kl": 0.18999108150601388, + "learning_rate": 1.9621398496985566e-05, + "loss": 0.0489, + "reward": 1.0333333611488342, + "reward_std": 0.12361449729651212, + "rewards/accuracy_reward": 0.07083333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9625000119209289, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 669.8458557128906, + "epoch": 0.17954872779644743, + "grad_norm": 0.07759030163288116, + "kl": 0.18845606744289398, + "learning_rate": 1.9618346381712163e-05, + "loss": 0.0296, + "reward": 1.015625011920929, + "reward_std": 0.11664673164486886, + "rewards/accuracy_reward": 0.037500002235174176, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completion_length": 685.6666809082031, + "epoch": 0.17986877900464074, + "grad_norm": 0.08038783073425293, + "kl": 0.14941411837935448, + "learning_rate": 1.9615282252693407e-05, + "loss": 0.04, + "reward": 1.0671875298023223, + "reward_std": 0.1277464386075735, + "rewards/accuracy_reward": 0.0937500026077032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.973437511920929, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completion_length": 654.0541870117188, + "epoch": 0.18018883021283405, + "grad_norm": 0.061528515070676804, + "kl": 0.23907251805067062, + "learning_rate": 1.9612206113756536e-05, + "loss": 0.0627, + "reward": 1.0000000238418578, + "reward_std": 0.1499695971608162, + "rewards/accuracy_reward": 0.039583335630595684, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9604166805744171, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.6979370117188, + "epoch": 0.18050888142102736, + "grad_norm": 0.11629898101091385, + "kl": 0.22134559452533722, + "learning_rate": 1.9609117968743794e-05, + "loss": 0.0883, + "reward": 1.0432291865348815, + "reward_std": 0.15978550240397454, + "rewards/accuracy_reward": 0.08333333544433116, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958492279053, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completion_length": 688.502099609375, + "epoch": 0.18082893262922067, + "grad_norm": 0.25423702597618103, + "kl": 0.21465194523334502, + "learning_rate": 1.9606017821512405e-05, + "loss": 0.067, + "reward": 0.9765625059604645, + "reward_std": 0.1502749115228653, + "rewards/accuracy_reward": 0.01666666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958373069764, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 692.4937744140625, + "epoch": 0.18114898383741398, + "grad_norm": 0.16149629652500153, + "kl": 0.3055619314312935, + "learning_rate": 1.960290567593459e-05, + "loss": 0.0905, + "reward": 0.9885416865348816, + "reward_std": 0.18179295882582663, + "rewards/accuracy_reward": 0.04583333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9427083492279053, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completion_length": 666.0479309082032, + "epoch": 0.1814690350456073, + "grad_norm": 0.25656628608703613, + "kl": 0.4261516511440277, + "learning_rate": 1.9599781535897562e-05, + "loss": 0.1483, + "reward": 0.9687500059604645, + "reward_std": 0.21279908418655397, + "rewards/accuracy_reward": 0.04791666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9208333432674408, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completion_length": 696.0646057128906, + "epoch": 0.1817890862538006, + "grad_norm": 0.25229644775390625, + "kl": 0.43460706919431685, + "learning_rate": 1.9596645405303508e-05, + "loss": 0.1277, + "reward": 0.994791692495346, + "reward_std": 0.23564037531614304, + "rewards/accuracy_reward": 0.06666666734963655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.928125011920929, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completion_length": 711.927099609375, + "epoch": 0.18210913746199392, + "grad_norm": 0.4171068072319031, + "kl": 0.6513818740844727, + "learning_rate": 1.9593497288069603e-05, + "loss": 0.1526, + "reward": 0.9005208492279053, + "reward_std": 0.24727483838796616, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8963541805744171, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.3687713623046, + "epoch": 0.18242918867018723, + "grad_norm": 0.2483467310667038, + "kl": 0.7798386961221695, + "learning_rate": 1.9590337188127978e-05, + "loss": 0.1642, + "reward": 1.012500023841858, + "reward_std": 0.27335314750671386, + "rewards/accuracy_reward": 0.11666666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 681.7146057128906, + "epoch": 0.18274923987838054, + "grad_norm": 0.3533123731613159, + "kl": 0.6377503961324692, + "learning_rate": 1.9587165109425746e-05, + "loss": 0.1724, + "reward": 0.9296875298023224, + "reward_std": 0.2813147783279419, + "rewards/accuracy_reward": 0.05208333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8776041865348816, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completion_length": 652.0479431152344, + "epoch": 0.18306929108657385, + "grad_norm": 0.21204252541065216, + "kl": 0.408619812130928, + "learning_rate": 1.9583981055924966e-05, + "loss": 0.1507, + "reward": 0.9921875178813935, + "reward_std": 0.24514594674110413, + "rewards/accuracy_reward": 0.10625000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.885937511920929, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completion_length": 710.6229370117187, + "epoch": 0.18338934229476717, + "grad_norm": 0.26197925209999084, + "kl": 0.49179228246212003, + "learning_rate": 1.9580785031602673e-05, + "loss": 0.1293, + "reward": 0.9187500178813934, + "reward_std": 0.27653331905603407, + "rewards/accuracy_reward": 0.058333336375653744, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8604166805744171, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completion_length": 667.7625183105469, + "epoch": 0.18370939350296048, + "grad_norm": 0.3436623513698578, + "kl": 0.43792471289634705, + "learning_rate": 1.9577577040450842e-05, + "loss": 0.1387, + "reward": 0.9255208611488343, + "reward_std": 0.22069956436753274, + "rewards/accuracy_reward": 0.010416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9151041805744171, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completion_length": 683.5166870117188, + "epoch": 0.1840294447111538, + "grad_norm": 0.38407763838768005, + "kl": 0.5805969923734665, + "learning_rate": 1.9574357086476398e-05, + "loss": 0.1509, + "reward": 0.9453125178813935, + "reward_std": 0.2484004467725754, + "rewards/accuracy_reward": 0.04791666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8973958432674408, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 669.6416931152344, + "epoch": 0.1843494959193471, + "grad_norm": 0.3657831847667694, + "kl": 1.5230638265609742, + "learning_rate": 1.95711251737012e-05, + "loss": 0.1762, + "reward": 0.9635416865348816, + "reward_std": 0.23155898600816727, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8968750178813935, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completion_length": 706.039599609375, + "epoch": 0.1846695471275404, + "grad_norm": 0.2620721757411957, + "kl": 0.5607627764344215, + "learning_rate": 1.9567881306162065e-05, + "loss": 0.1382, + "reward": 1.0312500178813935, + "reward_std": 0.24164563566446304, + "rewards/accuracy_reward": 0.11458333544433116, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9166666746139527, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completion_length": 691.7416931152344, + "epoch": 0.18498959833573372, + "grad_norm": 0.3571402132511139, + "kl": 0.5915179625153542, + "learning_rate": 1.956462548791072e-05, + "loss": 0.1009, + "reward": 0.9963541805744172, + "reward_std": 0.24583946019411088, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9234375059604645, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completion_length": 680.7125183105469, + "epoch": 0.18530964954392704, + "grad_norm": 0.26667729020118713, + "kl": 0.4023525446653366, + "learning_rate": 1.9561357723013827e-05, + "loss": 0.1324, + "reward": 1.0218750298023225, + "reward_std": 0.21049386411905288, + "rewards/accuracy_reward": 0.09583333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9260416805744172, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completion_length": 695.9666931152344, + "epoch": 0.18562970075212035, + "grad_norm": 0.15235862135887146, + "kl": 0.41516488790512085, + "learning_rate": 1.9558078015552973e-05, + "loss": 0.0822, + "reward": 0.9942708492279053, + "reward_std": 0.21924960762262344, + "rewards/accuracy_reward": 0.05416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9401041746139527, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.2229370117187, + "epoch": 0.18594975196031366, + "grad_norm": 0.16361404955387115, + "kl": 0.3572592079639435, + "learning_rate": 1.9554786369624666e-05, + "loss": 0.1157, + "reward": 1.0755208492279054, + "reward_std": 0.23814705833792688, + "rewards/accuracy_reward": 0.13541667088866233, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9401041746139527, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completion_length": 644.3979370117188, + "epoch": 0.18626980316850697, + "grad_norm": 0.2164674550294876, + "kl": 0.3529484748840332, + "learning_rate": 1.9551482789340308e-05, + "loss": 0.0896, + "reward": 1.0656250357627868, + "reward_std": 0.1942263960838318, + "rewards/accuracy_reward": 0.1104166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9552083492279053, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completion_length": 688.3125244140625, + "epoch": 0.18658985437670028, + "grad_norm": 0.13018456101417542, + "kl": 0.40574545711278914, + "learning_rate": 1.9548167278826224e-05, + "loss": 0.0848, + "reward": 1.0317708492279052, + "reward_std": 0.14726283103227616, + "rewards/accuracy_reward": 0.07708333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9546875119209289, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completion_length": 750.0979431152343, + "epoch": 0.1869099055848936, + "grad_norm": 0.1716415286064148, + "kl": 0.30623201876878736, + "learning_rate": 1.9544839842223636e-05, + "loss": 0.0671, + "reward": 1.0182291865348816, + "reward_std": 0.17932818606495857, + "rewards/accuracy_reward": 0.06250000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291805744171, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completion_length": 698.2916870117188, + "epoch": 0.1872299567930869, + "grad_norm": 0.23720037937164307, + "kl": 0.29293742030858994, + "learning_rate": 1.9541500483688663e-05, + "loss": 0.1013, + "reward": 1.0354166865348815, + "reward_std": 0.19150078296661377, + "rewards/accuracy_reward": 0.08958333488553763, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9458333373069763, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 757.683349609375, + "epoch": 0.18755000800128022, + "grad_norm": 0.08887584507465363, + "kl": 0.18934873640537261, + "learning_rate": 1.9538149207392306e-05, + "loss": 0.0544, + "reward": 0.9760416746139526, + "reward_std": 0.0908809632062912, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583373069763, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completion_length": 721.4979309082031, + "epoch": 0.18787005920947353, + "grad_norm": 0.07203318923711777, + "kl": 0.24687618166208267, + "learning_rate": 1.9534786017520466e-05, + "loss": 0.0771, + "reward": 1.0692708551883698, + "reward_std": 0.16076359152793884, + "rewards/accuracy_reward": 0.11666667014360428, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9526041746139526, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completion_length": 739.3646057128906, + "epoch": 0.18819011041766684, + "grad_norm": 0.27686551213264465, + "kl": 0.5672411054372788, + "learning_rate": 1.9531410918273915e-05, + "loss": 0.0731, + "reward": 0.9807291924953461, + "reward_std": 0.2045274019241333, + "rewards/accuracy_reward": 0.06041666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9203125059604644, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 728.4708618164062, + "epoch": 0.18851016162586015, + "grad_norm": 0.18232998251914978, + "kl": 0.3686387039721012, + "learning_rate": 1.9528023913868305e-05, + "loss": 0.0819, + "reward": 0.9927083611488342, + "reward_std": 0.16963814198970795, + "rewards/accuracy_reward": 0.03958333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9531250059604645, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completion_length": 690.6479370117188, + "epoch": 0.18883021283405343, + "grad_norm": 0.10632241517305374, + "kl": 0.3312704361975193, + "learning_rate": 1.9524625008534153e-05, + "loss": 0.0953, + "reward": 1.021875011920929, + "reward_std": 0.1697681626304984, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9531250119209289, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 703.708349609375, + "epoch": 0.18915026404224675, + "grad_norm": 0.05055807903409004, + "kl": 0.2949476674199104, + "learning_rate": 1.9521214206516845e-05, + "loss": 0.0396, + "reward": 0.9864583492279053, + "reward_std": 0.09214165061712265, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completion_length": 717.8375244140625, + "epoch": 0.18947031525044006, + "grad_norm": 0.16359588503837585, + "kl": 0.19101431891322135, + "learning_rate": 1.9517791512076628e-05, + "loss": 0.0395, + "reward": 1.0151041984558105, + "reward_std": 0.14598013013601302, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9588541805744171, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completion_length": 688.8396057128906, + "epoch": 0.18979036645863337, + "grad_norm": 0.09676505625247955, + "kl": 0.188329254090786, + "learning_rate": 1.95143569294886e-05, + "loss": 0.0821, + "reward": 0.9786458432674408, + "reward_std": 0.1273002317175269, + "rewards/accuracy_reward": 0.01458333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.964062511920929, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completion_length": 679.6104309082032, + "epoch": 0.19011041766682668, + "grad_norm": 0.08294422924518585, + "kl": 0.1577234983444214, + "learning_rate": 1.9510910463042704e-05, + "loss": 0.0623, + "reward": 1.005729180574417, + "reward_std": 0.13024714030325413, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291805744171, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completion_length": 738.9958618164062, + "epoch": 0.19043046887502, + "grad_norm": 0.07775887846946716, + "kl": 0.13679290562868118, + "learning_rate": 1.9507452117043736e-05, + "loss": 0.0505, + "reward": 1.0354166805744172, + "reward_std": 0.1238449014723301, + "rewards/accuracy_reward": 0.06041666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000059604644, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 657.9041870117187, + "epoch": 0.1907505200832133, + "grad_norm": 0.1872749775648117, + "kl": 0.17498825788497924, + "learning_rate": 1.950398189581132e-05, + "loss": 0.0693, + "reward": 1.1302083551883697, + "reward_std": 0.11385347358882428, + "rewards/accuracy_reward": 0.16458333730697633, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9656250059604645, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completion_length": 733.158349609375, + "epoch": 0.19107057129140662, + "grad_norm": 0.07586333155632019, + "kl": 0.17170817404985428, + "learning_rate": 1.9500499803679925e-05, + "loss": 0.0651, + "reward": 0.9713541805744171, + "reward_std": 0.10218179430812598, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.967187511920929, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completion_length": 692.6416809082032, + "epoch": 0.19139062249959993, + "grad_norm": 0.08267262578010559, + "kl": 0.2973045527935028, + "learning_rate": 1.9497005844998835e-05, + "loss": 0.065, + "reward": 1.0682291984558105, + "reward_std": 0.17779658660292624, + "rewards/accuracy_reward": 0.12083333693444728, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9473958432674408, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completion_length": 728.98544921875, + "epoch": 0.19171067370779324, + "grad_norm": 0.04687151312828064, + "kl": 0.19852605685591698, + "learning_rate": 1.949350002413216e-05, + "loss": 0.0557, + "reward": 0.9947916805744171, + "reward_std": 0.11155761461704969, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916805744172, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.8187683105468, + "epoch": 0.19203072491598655, + "grad_norm": 0.1460895538330078, + "kl": 0.262048863619566, + "learning_rate": 1.9489982345458832e-05, + "loss": 0.121, + "reward": 1.0598958492279054, + "reward_std": 0.12374843284487724, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958432674408, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 675.1812683105469, + "epoch": 0.19235077612417986, + "grad_norm": 0.17061711847782135, + "kl": 0.31913367435336115, + "learning_rate": 1.9486452813372586e-05, + "loss": 0.1273, + "reward": 1.0255208551883697, + "reward_std": 0.15627394691109658, + "rewards/accuracy_reward": 0.08541666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9401041746139527, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completion_length": 699.1958557128906, + "epoch": 0.19267082733237317, + "grad_norm": 0.12607340514659882, + "kl": 0.23554740473628044, + "learning_rate": 1.9482911432281963e-05, + "loss": 0.0923, + "reward": 1.0270833611488341, + "reward_std": 0.13728910144418477, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9625000119209289, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completion_length": 717.2666809082032, + "epoch": 0.19299087854056649, + "grad_norm": 0.17443907260894775, + "kl": 0.2461421586573124, + "learning_rate": 1.947935820661031e-05, + "loss": 0.066, + "reward": 1.0833333492279054, + "reward_std": 0.15556199103593826, + "rewards/accuracy_reward": 0.12291667107492685, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9604166805744171, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completion_length": 674.7812744140625, + "epoch": 0.1933109297487598, + "grad_norm": 0.13457360863685608, + "kl": 0.29405288547277453, + "learning_rate": 1.947579314079577e-05, + "loss": 0.0852, + "reward": 0.9744791865348816, + "reward_std": 0.16201976984739302, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9348958492279053, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.6604309082031, + "epoch": 0.1936309809569531, + "grad_norm": 0.13644298911094666, + "kl": 0.29756073504686353, + "learning_rate": 1.9472216239291256e-05, + "loss": 0.1072, + "reward": 0.9921875178813935, + "reward_std": 0.15694627091288565, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9484375059604645, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 707.0354309082031, + "epoch": 0.19395103216514642, + "grad_norm": 0.09973659366369247, + "kl": 0.2315961815416813, + "learning_rate": 1.946862750656449e-05, + "loss": 0.0682, + "reward": 1.0463541805744172, + "reward_std": 0.17754550725221635, + "rewards/accuracy_reward": 0.10000000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9463541746139527, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completion_length": 702.8583557128907, + "epoch": 0.19427108337333973, + "grad_norm": 0.11070112884044647, + "kl": 0.3192076399922371, + "learning_rate": 1.946502694709796e-05, + "loss": 0.1039, + "reward": 1.0281250178813934, + "reward_std": 0.19837626814842224, + "rewards/accuracy_reward": 0.09583333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9322916805744171, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completion_length": 648.0583557128906, + "epoch": 0.19459113458153304, + "grad_norm": 0.1061927005648613, + "kl": 0.30815952718257905, + "learning_rate": 1.9461414565388917e-05, + "loss": 0.1082, + "reward": 1.0973958492279052, + "reward_std": 0.21912664771080018, + "rewards/accuracy_reward": 0.15416667331010103, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9432291746139526, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completion_length": 687.2354370117188, + "epoch": 0.19491118578972635, + "grad_norm": 0.17305564880371094, + "kl": 0.31731778383255005, + "learning_rate": 1.9457790365949395e-05, + "loss": 0.0975, + "reward": 1.020312535762787, + "reward_std": 0.18112142533063888, + "rewards/accuracy_reward": 0.0854166692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9348958492279053, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completion_length": 657.0562622070313, + "epoch": 0.19523123699791967, + "grad_norm": 0.17710889875888824, + "kl": 0.3631772108376026, + "learning_rate": 1.945415435330618e-05, + "loss": 0.1042, + "reward": 0.9520833492279053, + "reward_std": 0.16206833571195603, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9458333551883698, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 722.6562683105469, + "epoch": 0.19555128820611298, + "grad_norm": 0.1479700654745102, + "kl": 0.531545577943325, + "learning_rate": 1.945050653200081e-05, + "loss": 0.1065, + "reward": 0.9593750238418579, + "reward_std": 0.23627854734659196, + "rewards/accuracy_reward": 0.043750001676380636, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9156250119209289, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completion_length": 741.0146118164063, + "epoch": 0.1958713394143063, + "grad_norm": 0.3069762885570526, + "kl": 0.7132264107465744, + "learning_rate": 1.9446846906589586e-05, + "loss": 0.1666, + "reward": 0.9562500238418579, + "reward_std": 0.2901140823960304, + "rewards/accuracy_reward": 0.0708333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8854166805744171, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completion_length": 687.283349609375, + "epoch": 0.1961913906224996, + "grad_norm": 0.303815633058548, + "kl": 0.5620258882641792, + "learning_rate": 1.9443175481643536e-05, + "loss": 0.1316, + "reward": 0.9885416805744172, + "reward_std": 0.24952167868614197, + "rewards/accuracy_reward": 0.09166666995733977, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.896875011920929, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completion_length": 759.2500183105469, + "epoch": 0.1965114418306929, + "grad_norm": 0.10386354476213455, + "kl": 0.34386143982410433, + "learning_rate": 1.9439492261748438e-05, + "loss": 0.0905, + "reward": 1.0187500178813935, + "reward_std": 0.19125093519687653, + "rewards/accuracy_reward": 0.08333333544433116, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9354166805744171, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completion_length": 689.5208557128906, + "epoch": 0.19683149303888622, + "grad_norm": 0.16504688560962677, + "kl": 0.2635188832879066, + "learning_rate": 1.9435797251504797e-05, + "loss": 0.1012, + "reward": 1.0119791805744172, + "reward_std": 0.1481368623673916, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9453125178813935, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 706.8146118164062, + "epoch": 0.19715154424707954, + "grad_norm": 0.23499853909015656, + "kl": 0.4958648651838303, + "learning_rate": 1.9432090455527847e-05, + "loss": 0.1062, + "reward": 0.9473958432674408, + "reward_std": 0.22377754971385003, + "rewards/accuracy_reward": 0.03125000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9161458373069763, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completion_length": 685.5250183105469, + "epoch": 0.19747159545527285, + "grad_norm": 0.1540578454732895, + "kl": 0.22922171503305436, + "learning_rate": 1.9428371878447545e-05, + "loss": 0.0918, + "reward": 1.160416716337204, + "reward_std": 0.1973195567727089, + "rewards/accuracy_reward": 0.21250000689178705, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9479166805744171, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completion_length": 794.7541931152343, + "epoch": 0.19779164666346616, + "grad_norm": 0.1357450783252716, + "kl": 0.2704797863960266, + "learning_rate": 1.9424641524908553e-05, + "loss": 0.0411, + "reward": 0.9552083492279053, + "reward_std": 0.1474937668070197, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9510416805744171, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completion_length": 722.7125183105469, + "epoch": 0.19811169787165947, + "grad_norm": 0.16005557775497437, + "kl": 0.3636905699968338, + "learning_rate": 1.942089939957026e-05, + "loss": 0.0828, + "reward": 0.9494791805744172, + "reward_std": 0.2315253049135208, + "rewards/accuracy_reward": 0.02500000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9244791865348816, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completion_length": 678.9395935058594, + "epoch": 0.19843174907985278, + "grad_norm": 0.21744054555892944, + "kl": 0.5238471448421478, + "learning_rate": 1.9417145507106737e-05, + "loss": 0.1072, + "reward": 0.9651041865348816, + "reward_std": 0.21560292392969133, + "rewards/accuracy_reward": 0.05416666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.910937511920929, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 685.8021118164063, + "epoch": 0.1987518002880461, + "grad_norm": 0.2656537890434265, + "kl": 0.6612590730190278, + "learning_rate": 1.9413379852206772e-05, + "loss": 0.1127, + "reward": 0.9630208432674408, + "reward_std": 0.1951449528336525, + "rewards/accuracy_reward": 0.04166666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9213541746139526, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completion_length": 686.4250244140625, + "epoch": 0.1990718514962394, + "grad_norm": 0.17960472404956818, + "kl": 0.41118341088294985, + "learning_rate": 1.940960243957383e-05, + "loss": 0.1284, + "reward": 1.0088541805744171, + "reward_std": 0.16900431141257286, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.942187511920929, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completion_length": 668.5521057128906, + "epoch": 0.19939190270443272, + "grad_norm": 0.14084936678409576, + "kl": 0.3053492411971092, + "learning_rate": 1.9405813273926076e-05, + "loss": 0.0959, + "reward": 1.0520833551883697, + "reward_std": 0.15083030611276627, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.950000011920929, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completion_length": 638.0021057128906, + "epoch": 0.19971195391262603, + "grad_norm": 0.33902284502983093, + "kl": 0.4690784841775894, + "learning_rate": 1.9402012359996342e-05, + "loss": 0.1339, + "reward": 1.1656250417232514, + "reward_std": 0.2067689336836338, + "rewards/accuracy_reward": 0.2250000076368451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.940625011920929, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 687.9312683105469, + "epoch": 0.20003200512081934, + "grad_norm": 0.12033872306346893, + "kl": 0.23370456770062448, + "learning_rate": 1.9398199702532143e-05, + "loss": 0.0821, + "reward": 1.0390625298023224, + "reward_std": 0.19397076815366746, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291746139527, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 687.9896057128906, + "epoch": 0.20035205632901265, + "grad_norm": 0.1661445051431656, + "kl": 0.20272066816687584, + "learning_rate": 1.9394375306295655e-05, + "loss": 0.0791, + "reward": 0.9723958492279052, + "reward_std": 0.12908698618412018, + "rewards/accuracy_reward": 0.008333333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.964062511920929, + "step": 626 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.8812683105468, + "epoch": 0.20067210753720596, + "grad_norm": 0.1688031256198883, + "kl": 0.2342596873641014, + "learning_rate": 1.9390539176063723e-05, + "loss": 0.0951, + "reward": 1.1093750178813935, + "reward_std": 0.11803668811917305, + "rewards/accuracy_reward": 0.14166667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083373069764, + "step": 627 + }, + { + "clip_ratio": 0.0, + "completion_length": 624.2083557128906, + "epoch": 0.20099215874539927, + "grad_norm": 0.13567431271076202, + "kl": 0.240685623139143, + "learning_rate": 1.9386691316627845e-05, + "loss": 0.0907, + "reward": 1.084375023841858, + "reward_std": 0.17157965078949927, + "rewards/accuracy_reward": 0.11875000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9656250059604645, + "step": 628 + }, + { + "clip_ratio": 0.0, + "completion_length": 647.1916809082031, + "epoch": 0.20131220995359259, + "grad_norm": 0.11089053750038147, + "kl": 0.3034446746110916, + "learning_rate": 1.938283173279417e-05, + "loss": 0.1056, + "reward": 0.9802083611488343, + "reward_std": 0.21385945081710817, + "rewards/accuracy_reward": 0.03750000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 629 + }, + { + "clip_ratio": 0.0, + "completion_length": 622.4396118164062, + "epoch": 0.2016322611617859, + "grad_norm": 0.13379743695259094, + "kl": 0.3716781333088875, + "learning_rate": 1.9378960429383494e-05, + "loss": 0.1141, + "reward": 0.9442708432674408, + "reward_std": 0.14153240323066713, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.942187511920929, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.175015258789, + "epoch": 0.2019523123699792, + "grad_norm": 0.3276001811027527, + "kl": 0.3941763326525688, + "learning_rate": 1.937507741123124e-05, + "loss": 0.0728, + "reward": 1.0151041984558105, + "reward_std": 0.17675597220659256, + "rewards/accuracy_reward": 0.0625000026077032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9526041865348815, + "step": 631 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.908349609375, + "epoch": 0.20227236357817252, + "grad_norm": 0.1801009476184845, + "kl": 0.5507948979735374, + "learning_rate": 1.9371182683187477e-05, + "loss": 0.169, + "reward": 1.021354192495346, + "reward_std": 0.20031024273484946, + "rewards/accuracy_reward": 0.09583333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9255208492279052, + "step": 632 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.6271057128906, + "epoch": 0.20259241478636583, + "grad_norm": 0.25220805406570435, + "kl": 0.4888093382120132, + "learning_rate": 1.9367276250116894e-05, + "loss": 0.1247, + "reward": 1.0687500417232514, + "reward_std": 0.16525894403457642, + "rewards/accuracy_reward": 0.12291667014360427, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9458333492279053, + "step": 633 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.6562652587891, + "epoch": 0.20291246599455912, + "grad_norm": 0.2679707705974579, + "kl": 0.541820627450943, + "learning_rate": 1.9363358116898804e-05, + "loss": 0.1338, + "reward": 1.0411458611488342, + "reward_std": 0.19658421874046325, + "rewards/accuracy_reward": 0.12083333842456341, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9203125238418579, + "step": 634 + }, + { + "clip_ratio": 0.0, + "completion_length": 650.052099609375, + "epoch": 0.20323251720275243, + "grad_norm": 0.2806706130504608, + "kl": 0.48353932052850723, + "learning_rate": 1.935942828842713e-05, + "loss": 0.1038, + "reward": 0.9880208551883698, + "reward_std": 0.20751163512468337, + "rewards/accuracy_reward": 0.05416666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9338541746139526, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.6312683105468, + "epoch": 0.20355256841094574, + "grad_norm": 0.14846235513687134, + "kl": 0.5935159817337989, + "learning_rate": 1.93554867696104e-05, + "loss": 0.1618, + "reward": 0.9677083611488342, + "reward_std": 0.22315222583711147, + "rewards/accuracy_reward": 0.04791666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9197916865348816, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.1833526611329, + "epoch": 0.20387261961913905, + "grad_norm": 0.1585318148136139, + "kl": 0.4165584176778793, + "learning_rate": 1.9351533565371747e-05, + "loss": 0.1312, + "reward": 1.0536458611488342, + "reward_std": 0.19778771847486495, + "rewards/accuracy_reward": 0.12083333693444728, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.932812511920929, + "step": 637 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.758349609375, + "epoch": 0.20419267082733236, + "grad_norm": 0.17503924667835236, + "kl": 0.47458241432905196, + "learning_rate": 1.9347568680648903e-05, + "loss": 0.1487, + "reward": 1.0119791805744172, + "reward_std": 0.26864703595638273, + "rewards/accuracy_reward": 0.08750000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9244791746139527, + "step": 638 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.4541809082032, + "epoch": 0.20451272203552567, + "grad_norm": 0.2242783159017563, + "kl": 0.33436805531382563, + "learning_rate": 1.9343592120394187e-05, + "loss": 0.1189, + "reward": 0.9973958611488343, + "reward_std": 0.18330222517251968, + "rewards/accuracy_reward": 0.05416666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9432291924953461, + "step": 639 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.3437683105469, + "epoch": 0.20483277324371899, + "grad_norm": 0.1964443325996399, + "kl": 0.3340280294418335, + "learning_rate": 1.9339603889574498e-05, + "loss": 0.1283, + "reward": 0.9588541865348816, + "reward_std": 0.17630846053361893, + "rewards/accuracy_reward": 0.020833334513008596, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9380208551883698, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 653.9416870117187, + "epoch": 0.2051528244519123, + "grad_norm": 0.15593037009239197, + "kl": 0.4793477475643158, + "learning_rate": 1.9335603993171318e-05, + "loss": 0.1163, + "reward": 0.9802083492279052, + "reward_std": 0.15702517479658126, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.934375011920929, + "step": 641 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.5083587646484, + "epoch": 0.2054728756601056, + "grad_norm": 0.1937408149242401, + "kl": 0.5273372441530227, + "learning_rate": 1.9331592436180698e-05, + "loss": 0.1689, + "reward": 0.9833333492279053, + "reward_std": 0.22838717848062515, + "rewards/accuracy_reward": 0.05625, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9270833492279053, + "step": 642 + }, + { + "clip_ratio": 0.0, + "completion_length": 654.8875183105469, + "epoch": 0.20579292686829892, + "grad_norm": 0.22712171077728271, + "kl": 0.6470546633005142, + "learning_rate": 1.932756922361325e-05, + "loss": 0.1601, + "reward": 1.032291704416275, + "reward_std": 0.2833305008709431, + "rewards/accuracy_reward": 0.13333333898335695, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8989583551883698, + "step": 643 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.6250122070312, + "epoch": 0.20611297807649223, + "grad_norm": 0.29243704676628113, + "kl": 0.45784421265125275, + "learning_rate": 1.932353436049414e-05, + "loss": 0.1327, + "reward": 0.986979192495346, + "reward_std": 0.20242194682359696, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9307291805744171, + "step": 644 + }, + { + "clip_ratio": 0.0, + "completion_length": 671.0583557128906, + "epoch": 0.20643302928468554, + "grad_norm": 0.2416207194328308, + "kl": 0.39548794478178023, + "learning_rate": 1.9319487851863103e-05, + "loss": 0.1223, + "reward": 1.1203125298023224, + "reward_std": 0.2333257243037224, + "rewards/accuracy_reward": 0.1875000052154064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9328125059604645, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 624.8312683105469, + "epoch": 0.20675308049287885, + "grad_norm": 0.1228092610836029, + "kl": 0.2716518625617027, + "learning_rate": 1.9315429702774408e-05, + "loss": 0.0677, + "reward": 0.9864583551883698, + "reward_std": 0.15608318373560906, + "rewards/accuracy_reward": 0.029166667722165585, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916805744172, + "step": 646 + }, + { + "clip_ratio": 0.0, + "completion_length": 642.5541870117188, + "epoch": 0.20707313170107217, + "grad_norm": 0.17116791009902954, + "kl": 0.26060923784971235, + "learning_rate": 1.9311359918296855e-05, + "loss": 0.0978, + "reward": 1.0656250238418579, + "reward_std": 0.14626556485891343, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.965625011920929, + "step": 647 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.2333557128907, + "epoch": 0.20739318290926548, + "grad_norm": 0.12016578763723373, + "kl": 0.24654133021831512, + "learning_rate": 1.9307278503513803e-05, + "loss": 0.061, + "reward": 1.037500023841858, + "reward_std": 0.12056761756539344, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833492279052, + "step": 648 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.2396057128906, + "epoch": 0.2077132341174588, + "grad_norm": 0.07504566013813019, + "kl": 0.21631157025694847, + "learning_rate": 1.9303185463523108e-05, + "loss": 0.0896, + "reward": 1.0578125178813935, + "reward_std": 0.13687589354813098, + "rewards/accuracy_reward": 0.0916666692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 649 + }, + { + "clip_ratio": 0.0, + "completion_length": 648.6208435058594, + "epoch": 0.2080332853256521, + "grad_norm": 0.2659440040588379, + "kl": 0.1398515522480011, + "learning_rate": 1.929908080343717e-05, + "loss": 0.0441, + "reward": 1.048437511920929, + "reward_std": 0.11139777526259423, + "rewards/accuracy_reward": 0.06250000316649676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375059604645, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 669.0771118164063, + "epoch": 0.2083533365338454, + "grad_norm": 0.12325471639633179, + "kl": 0.24901945143938065, + "learning_rate": 1.9294964528382885e-05, + "loss": 0.0615, + "reward": 1.0468750238418578, + "reward_std": 0.164197001978755, + "rewards/accuracy_reward": 0.0791666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083492279053, + "step": 651 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.5500122070313, + "epoch": 0.20867338774203872, + "grad_norm": 0.10141347348690033, + "kl": 0.3649654157459736, + "learning_rate": 1.929083664350167e-05, + "loss": 0.1014, + "reward": 1.0307291924953461, + "reward_std": 0.14789256304502488, + "rewards/accuracy_reward": 0.08125000242143869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9494791746139526, + "step": 652 + }, + { + "clip_ratio": 0.0, + "completion_length": 692.3437683105469, + "epoch": 0.20899343895023204, + "grad_norm": 0.09904764592647552, + "kl": 0.2902476988732815, + "learning_rate": 1.9286697153949436e-05, + "loss": 0.0651, + "reward": 1.0135416865348816, + "reward_std": 0.14488410539925098, + "rewards/accuracy_reward": 0.04791666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.965625011920929, + "step": 653 + }, + { + "clip_ratio": 0.0, + "completion_length": 662.4937744140625, + "epoch": 0.20931349015842535, + "grad_norm": 0.10897406935691833, + "kl": 0.20142997726798056, + "learning_rate": 1.9282546064896594e-05, + "loss": 0.0721, + "reward": 1.078125011920929, + "reward_std": 0.09604029338806867, + "rewards/accuracy_reward": 0.10625000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.971875011920929, + "step": 654 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.9479431152344, + "epoch": 0.20963354136661866, + "grad_norm": 0.23808997869491577, + "kl": 0.22053091898560523, + "learning_rate": 1.9278383381528036e-05, + "loss": 0.0576, + "reward": 1.0166666805744171, + "reward_std": 0.12363926023244858, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833492279053, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 673.3208557128906, + "epoch": 0.20995359257481197, + "grad_norm": 0.07611843943595886, + "kl": 0.2132401891052723, + "learning_rate": 1.9274209109043146e-05, + "loss": 0.0521, + "reward": 1.0567708492279053, + "reward_std": 0.1237191118299961, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041746139527, + "step": 656 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.1875122070312, + "epoch": 0.21027364378300528, + "grad_norm": 0.09360788017511368, + "kl": 0.23806697279214858, + "learning_rate": 1.927002325265577e-05, + "loss": 0.0523, + "reward": 1.0703125298023224, + "reward_std": 0.14319055881351234, + "rewards/accuracy_reward": 0.09375000428408384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625178813935, + "step": 657 + }, + { + "clip_ratio": 0.0, + "completion_length": 661.1416870117188, + "epoch": 0.2105936949911986, + "grad_norm": 0.07128574699163437, + "kl": 0.16915738582611084, + "learning_rate": 1.9265825817594232e-05, + "loss": 0.0558, + "reward": 1.0114583551883698, + "reward_std": 0.08447882384061814, + "rewards/accuracy_reward": 0.02708333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 658 + }, + { + "clip_ratio": 0.0, + "completion_length": 677.1687744140625, + "epoch": 0.2109137461993919, + "grad_norm": 0.06273775547742844, + "kl": 0.1647022284567356, + "learning_rate": 1.9261616809101317e-05, + "loss": 0.0496, + "reward": 1.0583333432674409, + "reward_std": 0.1425313174724579, + "rewards/accuracy_reward": 0.08333333544433116, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000178813935, + "step": 659 + }, + { + "clip_ratio": 0.0, + "completion_length": 688.7479370117187, + "epoch": 0.21123379740758522, + "grad_norm": 0.11021216958761215, + "kl": 0.2603888504207134, + "learning_rate": 1.9257396232434266e-05, + "loss": 0.0737, + "reward": 1.0020833432674408, + "reward_std": 0.09916403293609619, + "rewards/accuracy_reward": 0.02916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166805744172, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 658.9229431152344, + "epoch": 0.21155384861577853, + "grad_norm": 0.08434461057186127, + "kl": 0.16674437001347542, + "learning_rate": 1.9253164092864768e-05, + "loss": 0.059, + "reward": 1.0015625178813934, + "reward_std": 0.1232110183686018, + "rewards/accuracy_reward": 0.02916666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958432674408, + "step": 661 + }, + { + "clip_ratio": 0.0, + "completion_length": 655.5833557128906, + "epoch": 0.21187389982397184, + "grad_norm": 0.10858671367168427, + "kl": 0.13052089065313338, + "learning_rate": 1.9248920395678955e-05, + "loss": 0.0463, + "reward": 1.1036458551883697, + "reward_std": 0.11935643032193184, + "rewards/accuracy_reward": 0.12500000428408384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 662 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.4583435058594, + "epoch": 0.21219395103216515, + "grad_norm": 0.050972215831279755, + "kl": 0.13374503329396248, + "learning_rate": 1.9244665146177395e-05, + "loss": -0.0086, + "reward": 1.1062500059604645, + "reward_std": 0.10863641854375601, + "rewards/accuracy_reward": 0.11458333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9916666746139526, + "step": 663 + }, + { + "clip_ratio": 0.0, + "completion_length": 646.0729309082031, + "epoch": 0.21251400224035846, + "grad_norm": 0.06517866253852844, + "kl": 0.1450530506670475, + "learning_rate": 1.9240398349675083e-05, + "loss": 0.071, + "reward": 1.0520833551883697, + "reward_std": 0.09250790346413851, + "rewards/accuracy_reward": 0.07708333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 664 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.0187744140625, + "epoch": 0.21283405344855177, + "grad_norm": 0.10822878032922745, + "kl": 0.15510641485452653, + "learning_rate": 1.9236120011501442e-05, + "loss": 0.0592, + "reward": 1.1192708611488342, + "reward_std": 0.07371204420924186, + "rewards/accuracy_reward": 0.1375000050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708373069763, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 662.3271057128907, + "epoch": 0.21315410465674509, + "grad_norm": 0.09872405230998993, + "kl": 0.19749830849468708, + "learning_rate": 1.9231830137000305e-05, + "loss": 0.0727, + "reward": 0.9979166805744171, + "reward_std": 0.13651593923568725, + "rewards/accuracy_reward": 0.03125000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666746139526, + "step": 666 + }, + { + "clip_ratio": 0.0, + "completion_length": 677.6062744140625, + "epoch": 0.2134741558649384, + "grad_norm": 0.08014009892940521, + "kl": 0.16215406954288483, + "learning_rate": 1.922752873152992e-05, + "loss": 0.0448, + "reward": 1.037500011920929, + "reward_std": 0.12696239706128837, + "rewards/accuracy_reward": 0.06666667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333492279053, + "step": 667 + }, + { + "clip_ratio": 0.0, + "completion_length": 658.7479309082031, + "epoch": 0.2137942070731317, + "grad_norm": 0.17184635996818542, + "kl": 0.16198827996850013, + "learning_rate": 1.9223215800462937e-05, + "loss": 0.0821, + "reward": 1.041666680574417, + "reward_std": 0.12022981494665146, + "rewards/accuracy_reward": 0.07291666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500119209289, + "step": 668 + }, + { + "clip_ratio": 0.0, + "completion_length": 691.1166870117188, + "epoch": 0.21411425828132502, + "grad_norm": 0.1011757031083107, + "kl": 0.18935470506548882, + "learning_rate": 1.9218891349186394e-05, + "loss": 0.0567, + "reward": 1.0567708730697631, + "reward_std": 0.14946944694966077, + "rewards/accuracy_reward": 0.08958333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875238418579, + "step": 669 + }, + { + "clip_ratio": 0.0, + "completion_length": 652.1541870117187, + "epoch": 0.21443430948951833, + "grad_norm": 0.12701667845249176, + "kl": 0.2506525985896587, + "learning_rate": 1.9214555383101724e-05, + "loss": 0.0562, + "reward": 1.0416666924953462, + "reward_std": 0.1605004720389843, + "rewards/accuracy_reward": 0.07500000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666805744171, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.1229461669922, + "epoch": 0.21475436069771164, + "grad_norm": 0.10412585735321045, + "kl": 0.26857480928301813, + "learning_rate": 1.9210207907624748e-05, + "loss": 0.1258, + "reward": 1.0447917044162751, + "reward_std": 0.1726130098104477, + "rewards/accuracy_reward": 0.08958333618938923, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9552083551883698, + "step": 671 + }, + { + "clip_ratio": 0.0, + "completion_length": 668.1708618164063, + "epoch": 0.21507441190590496, + "grad_norm": 0.1114473044872284, + "kl": 0.3208130903542042, + "learning_rate": 1.920584892818566e-05, + "loss": 0.116, + "reward": 0.9984375238418579, + "reward_std": 0.20095318108797072, + "rewards/accuracy_reward": 0.04791666734963655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9505208432674408, + "step": 672 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.1812805175781, + "epoch": 0.21539446311409827, + "grad_norm": 0.33339497447013855, + "kl": 0.355214512348175, + "learning_rate": 1.9201478450229012e-05, + "loss": 0.1135, + "reward": 1.1171875238418578, + "reward_std": 0.13930478543043137, + "rewards/accuracy_reward": 0.1562500050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375059604645, + "step": 673 + }, + { + "clip_ratio": 0.0, + "completion_length": 669.3979370117188, + "epoch": 0.21571451432229158, + "grad_norm": 0.10623612254858017, + "kl": 0.3115902006626129, + "learning_rate": 1.919709647921373e-05, + "loss": 0.1144, + "reward": 0.9781250298023224, + "reward_std": 0.1574092723429203, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916805744172, + "step": 674 + }, + { + "clip_ratio": 0.0, + "completion_length": 647.3854370117188, + "epoch": 0.2160345655304849, + "grad_norm": 0.16619139909744263, + "kl": 0.23366658315062522, + "learning_rate": 1.9192703020613094e-05, + "loss": 0.0834, + "reward": 1.0635416984558106, + "reward_std": 0.11643952075392008, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583551883698, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completion_length": 669.1562652587891, + "epoch": 0.2163546167386782, + "grad_norm": 0.3299196660518646, + "kl": 0.3408443845808506, + "learning_rate": 1.918829807991473e-05, + "loss": 0.1221, + "reward": 1.0786458611488343, + "reward_std": 0.18198420107364655, + "rewards/accuracy_reward": 0.13125000353902577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9473958432674408, + "step": 676 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.083349609375, + "epoch": 0.2166746679468715, + "grad_norm": 0.8060458898544312, + "kl": 0.29906757101416587, + "learning_rate": 1.9183881662620606e-05, + "loss": 0.0866, + "reward": 1.0479166805744171, + "reward_std": 0.1341792933642864, + "rewards/accuracy_reward": 0.08750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9604166746139526, + "step": 677 + }, + { + "clip_ratio": 0.0, + "completion_length": 664.6291809082031, + "epoch": 0.2169947191550648, + "grad_norm": 2.5255606174468994, + "kl": 0.7097936183214187, + "learning_rate": 1.9179453774247023e-05, + "loss": 0.1361, + "reward": 0.9807291865348816, + "reward_std": 0.15256869401782752, + "rewards/accuracy_reward": 0.04166666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9390625178813934, + "step": 678 + }, + { + "clip_ratio": 0.0, + "completion_length": 675.3479309082031, + "epoch": 0.2173147703632581, + "grad_norm": 342.9512023925781, + "kl": 60.940721249580385, + "learning_rate": 1.9175014420324613e-05, + "loss": 3.6971, + "reward": 0.9463541865348816, + "reward_std": 0.22579180002212523, + "rewards/accuracy_reward": 0.03125000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9151041805744171, + "step": 679 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.5437683105469, + "epoch": 0.21763482157145142, + "grad_norm": 26.168609619140625, + "kl": 7.308992192149162, + "learning_rate": 1.917056360639833e-05, + "loss": 0.5832, + "reward": 1.0463541865348815, + "reward_std": 0.18034582138061522, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9463541746139527, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.9021057128906, + "epoch": 0.21795487277964473, + "grad_norm": 3.199669122695923, + "kl": 0.572056169807911, + "learning_rate": 1.9166101338027436e-05, + "loss": 0.1711, + "reward": 1.009375023841858, + "reward_std": 0.18491822630167007, + "rewards/accuracy_reward": 0.07708333544433117, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9302083492279053, + "step": 681 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.4750183105468, + "epoch": 0.21827492398783804, + "grad_norm": 2.4151406288146973, + "kl": 0.709225732088089, + "learning_rate": 1.916162762078551e-05, + "loss": 0.1667, + "reward": 0.9307291805744171, + "reward_std": 0.1792273811995983, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9265625059604645, + "step": 682 + }, + { + "clip_ratio": 0.0, + "completion_length": 654.2396057128906, + "epoch": 0.21859497519603135, + "grad_norm": 1.0843818187713623, + "kl": 1.1302322834730147, + "learning_rate": 1.915714246026042e-05, + "loss": 0.1616, + "reward": 1.0307291865348815, + "reward_std": 0.17401037737727165, + "rewards/accuracy_reward": 0.08541666995733976, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9453125178813935, + "step": 683 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.0979309082031, + "epoch": 0.21891502640422467, + "grad_norm": 0.26156195998191833, + "kl": 0.5273042991757393, + "learning_rate": 1.915264586205433e-05, + "loss": 0.1901, + "reward": 0.9677083551883697, + "reward_std": 0.193864406645298, + "rewards/accuracy_reward": 0.04791666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9197916805744171, + "step": 684 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.1666931152344, + "epoch": 0.21923507761241798, + "grad_norm": 0.3921996057033539, + "kl": 0.7704705983400345, + "learning_rate": 1.91481378317837e-05, + "loss": 0.2797, + "reward": 0.9046875238418579, + "reward_std": 0.34336878657341, + "rewards/accuracy_reward": 0.07500000279396772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8296875119209289, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completion_length": 715.9125244140625, + "epoch": 0.2195551288206113, + "grad_norm": 0.4275372326374054, + "kl": 0.7206552475690842, + "learning_rate": 1.9143618375079257e-05, + "loss": 0.2251, + "reward": 0.9151041805744171, + "reward_std": 0.33101013153791425, + "rewards/accuracy_reward": 0.09791666977107524, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.817187511920929, + "step": 686 + }, + { + "clip_ratio": 0.0, + "completion_length": 706.3562744140625, + "epoch": 0.2198751800288046, + "grad_norm": 0.6302100419998169, + "kl": 1.1008682191371917, + "learning_rate": 1.9139087497586004e-05, + "loss": 0.3284, + "reward": 0.7807291865348815, + "reward_std": 0.3698259711265564, + "rewards/accuracy_reward": 0.0708333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7098958492279053, + "step": 687 + }, + { + "clip_ratio": 0.0, + "completion_length": 666.8437744140625, + "epoch": 0.2201952312369979, + "grad_norm": 0.3019793927669525, + "kl": 0.6823631256818772, + "learning_rate": 1.9134545204963214e-05, + "loss": 0.2566, + "reward": 0.8682291805744171, + "reward_std": 0.32157149612903596, + "rewards/accuracy_reward": 0.05000000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8182291865348816, + "step": 688 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.3000213623047, + "epoch": 0.22051528244519122, + "grad_norm": 0.252623587846756, + "kl": 0.4231353387236595, + "learning_rate": 1.912999150288441e-05, + "loss": 0.236, + "reward": 0.912500011920929, + "reward_std": 0.24352166503667833, + "rewards/accuracy_reward": 0.01666666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8958333373069763, + "step": 689 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.7562805175781, + "epoch": 0.22083533365338454, + "grad_norm": 0.2588579058647156, + "kl": 0.4063556343317032, + "learning_rate": 1.912542639703737e-05, + "loss": 0.1746, + "reward": 0.9609375298023224, + "reward_std": 0.25126550942659376, + "rewards/accuracy_reward": 0.05416666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9067708611488342, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completion_length": 628.4041870117187, + "epoch": 0.22115538486157785, + "grad_norm": 0.35281902551651, + "kl": 0.36444804519414903, + "learning_rate": 1.912084989312412e-05, + "loss": 0.1878, + "reward": 0.9937500178813934, + "reward_std": 0.21244567185640334, + "rewards/accuracy_reward": 0.08125000242143869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9125000059604644, + "step": 691 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.5083435058593, + "epoch": 0.22147543606977116, + "grad_norm": 1.7048529386520386, + "kl": 0.3061882697045803, + "learning_rate": 1.9116261996860914e-05, + "loss": 0.1228, + "reward": 0.9671875238418579, + "reward_std": 0.17979936115443707, + "rewards/accuracy_reward": 0.02083333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9463541805744171, + "step": 692 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.952099609375, + "epoch": 0.22179548727796447, + "grad_norm": 1.6680268049240112, + "kl": 0.5593406990170479, + "learning_rate": 1.9111662713978242e-05, + "loss": 0.0753, + "reward": 1.0125000119209289, + "reward_std": 0.06362286508083344, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666746139527, + "step": 693 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.7687683105469, + "epoch": 0.22211553848615778, + "grad_norm": 43.46950912475586, + "kl": 12.55296850502491, + "learning_rate": 1.9107052050220808e-05, + "loss": 0.6179, + "reward": 1.1260417103767395, + "reward_std": 0.13523164130747317, + "rewards/accuracy_reward": 0.14791667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250178813934, + "step": 694 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.4521026611328, + "epoch": 0.2224355896943511, + "grad_norm": 47.668182373046875, + "kl": 10.92622417807579, + "learning_rate": 1.910243001134755e-05, + "loss": 0.739, + "reward": 1.0005208551883698, + "reward_std": 0.08894652742892503, + "rewards/accuracy_reward": 0.02291666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 695 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.3916870117188, + "epoch": 0.2227556409025444, + "grad_norm": 0.6365566253662109, + "kl": 0.6296287894248962, + "learning_rate": 1.909779660313159e-05, + "loss": 0.0362, + "reward": 1.0015625178813934, + "reward_std": 0.1047313479706645, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291746139526, + "step": 696 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.7125183105469, + "epoch": 0.22307569211073772, + "grad_norm": 0.41112789511680603, + "kl": 0.15625541731715203, + "learning_rate": 1.9093151831360268e-05, + "loss": 0.02, + "reward": 1.0812500178813935, + "reward_std": 0.11710264217108488, + "rewards/accuracy_reward": 0.10208333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666746139527, + "step": 697 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.0625183105469, + "epoch": 0.22339574331893103, + "grad_norm": 0.29101845622062683, + "kl": 0.17532607764005662, + "learning_rate": 1.9088495701835113e-05, + "loss": 0.022, + "reward": 1.0437500059604645, + "reward_std": 0.07715525384992361, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833373069763, + "step": 698 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.1250122070312, + "epoch": 0.22371579452712434, + "grad_norm": 0.20614954829216003, + "kl": 0.15920972526073457, + "learning_rate": 1.9083828220371835e-05, + "loss": 0.039, + "reward": 1.0114583551883698, + "reward_std": 0.12535146437585354, + "rewards/accuracy_reward": 0.04166666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 699 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.0562683105469, + "epoch": 0.22403584573531765, + "grad_norm": 0.13199330866336823, + "kl": 0.1495030015707016, + "learning_rate": 1.907914939280033e-05, + "loss": 0.0005, + "reward": 0.9937500298023224, + "reward_std": 0.07663525212556124, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166805744171, + "step": 700 + }, + { + "clip_ratio": 0.0, + "completion_length": 638.8729309082031, + "epoch": 0.22435589694351096, + "grad_norm": 0.06893268972635269, + "kl": 0.12032232657074929, + "learning_rate": 1.907445922496466e-05, + "loss": 0.0137, + "reward": 0.9947916865348816, + "reward_std": 0.04492462687194347, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.990625011920929, + "step": 701 + }, + { + "clip_ratio": 0.0, + "completion_length": 628.9604309082031, + "epoch": 0.22467594815170427, + "grad_norm": 0.11694058030843735, + "kl": 0.14955301880836486, + "learning_rate": 1.906975772272306e-05, + "loss": 0.0213, + "reward": 1.0473958492279052, + "reward_std": 0.10141881592571736, + "rewards/accuracy_reward": 0.06875000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 702 + }, + { + "clip_ratio": 0.0, + "completion_length": 664.6687744140625, + "epoch": 0.22499599935989759, + "grad_norm": 0.09218557178974152, + "kl": 0.1656613454222679, + "learning_rate": 1.906504489194791e-05, + "loss": 0.0414, + "reward": 1.0697916924953461, + "reward_std": 0.11464045755565166, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083432674408, + "step": 703 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.5875305175781, + "epoch": 0.2253160505680909, + "grad_norm": 0.15477770566940308, + "kl": 0.163698972761631, + "learning_rate": 1.9060320738525756e-05, + "loss": 0.0578, + "reward": 1.0651041865348816, + "reward_std": 0.12406023722141982, + "rewards/accuracy_reward": 0.09375000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541805744171, + "step": 704 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.4208557128907, + "epoch": 0.2256361017762842, + "grad_norm": 0.06102179363369942, + "kl": 0.14229050129652024, + "learning_rate": 1.905558526835727e-05, + "loss": 0.001, + "reward": 0.9911458492279053, + "reward_std": 0.06305509340018034, + "rewards/accuracy_reward": 0.00625, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958432674408, + "step": 705 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.3583618164063, + "epoch": 0.22595615298447752, + "grad_norm": 0.08625641465187073, + "kl": 0.12337017208337783, + "learning_rate": 1.9050838487357267e-05, + "loss": 0.0239, + "reward": 1.0260416865348816, + "reward_std": 0.09488309193402529, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9885416746139526, + "step": 706 + }, + { + "clip_ratio": 0.0, + "completion_length": 671.083349609375, + "epoch": 0.22627620419267083, + "grad_norm": 0.10532180964946747, + "kl": 0.14307744055986404, + "learning_rate": 1.904608040145469e-05, + "loss": 0.0231, + "reward": 1.0723958551883697, + "reward_std": 0.12180216908454895, + "rewards/accuracy_reward": 0.09375000316649676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458373069763, + "step": 707 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.1125183105469, + "epoch": 0.22659625540086414, + "grad_norm": 0.14669953286647797, + "kl": 0.16547591611742973, + "learning_rate": 1.9041311016592603e-05, + "loss": 0.0294, + "reward": 1.0380208551883698, + "reward_std": 0.12260267194360494, + "rewards/accuracy_reward": 0.058333334513008595, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9796875059604645, + "step": 708 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.3270935058594, + "epoch": 0.22691630660905746, + "grad_norm": 0.12648318707942963, + "kl": 0.16989169344305993, + "learning_rate": 1.903653033872818e-05, + "loss": 0.0318, + "reward": 1.1723958611488343, + "reward_std": 0.12271066904067993, + "rewards/accuracy_reward": 0.18958333879709244, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9828125059604644, + "step": 709 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.5833618164063, + "epoch": 0.22723635781725077, + "grad_norm": 0.3657815158367157, + "kl": 0.27132780849933624, + "learning_rate": 1.90317383738327e-05, + "loss": 0.0943, + "reward": 0.9718750178813934, + "reward_std": 0.16001009345054626, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9510416805744171, + "step": 710 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.6895935058594, + "epoch": 0.22755640902544408, + "grad_norm": 0.18482835590839386, + "kl": 0.19547294229269027, + "learning_rate": 1.902693512789154e-05, + "loss": 0.0638, + "reward": 1.1067708611488343, + "reward_std": 0.07665946874767542, + "rewards/accuracy_reward": 0.12291667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541746139526, + "step": 711 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.2645965576172, + "epoch": 0.2278764602336374, + "grad_norm": 0.45472854375839233, + "kl": 0.2865824416279793, + "learning_rate": 1.902212060690418e-05, + "loss": 0.0903, + "reward": 1.0869791984558106, + "reward_std": 0.16627902090549468, + "rewards/accuracy_reward": 0.11875000428408385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291805744171, + "step": 712 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.1000244140625, + "epoch": 0.2281965114418307, + "grad_norm": 0.3362894058227539, + "kl": 0.2055267460644245, + "learning_rate": 1.901729481688416e-05, + "loss": 0.0843, + "reward": 1.0875000298023223, + "reward_std": 0.09311237446963787, + "rewards/accuracy_reward": 0.11041667014360428, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833432674408, + "step": 713 + }, + { + "clip_ratio": 0.0, + "completion_length": 652.6833557128906, + "epoch": 0.228516562650024, + "grad_norm": 0.3454423248767853, + "kl": 0.3243007093667984, + "learning_rate": 1.9012457763859117e-05, + "loss": 0.1082, + "reward": 1.0291666865348816, + "reward_std": 0.13885583207011223, + "rewards/accuracy_reward": 0.0645833345130086, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833432674408, + "step": 714 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.847933959961, + "epoch": 0.22883661385821732, + "grad_norm": 0.3572365641593933, + "kl": 0.4481654688715935, + "learning_rate": 1.9007609453870738e-05, + "loss": 0.1275, + "reward": 1.0604166865348816, + "reward_std": 0.16449397206306457, + "rewards/accuracy_reward": 0.0958333371207118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833432674408, + "step": 715 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.216683959961, + "epoch": 0.22915666506641064, + "grad_norm": 0.6198186278343201, + "kl": 0.7278454639017582, + "learning_rate": 1.9002749892974785e-05, + "loss": 0.1337, + "reward": 1.1057291865348815, + "reward_std": 0.14216041043400765, + "rewards/accuracy_reward": 0.13125000502914191, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791686534882, + "step": 716 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.1416839599609, + "epoch": 0.22947671627460395, + "grad_norm": 0.6964424252510071, + "kl": 0.856408603489399, + "learning_rate": 1.8997879087241065e-05, + "loss": 0.1282, + "reward": 1.0177083492279053, + "reward_std": 0.11686915159225464, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.971875011920929, + "step": 717 + }, + { + "clip_ratio": 0.0, + "completion_length": 628.9312622070313, + "epoch": 0.22979676748279726, + "grad_norm": 0.7025728821754456, + "kl": 0.38182810619473456, + "learning_rate": 1.8992997042753437e-05, + "loss": 0.0746, + "reward": 1.1567708611488343, + "reward_std": 0.10790105611085891, + "rewards/accuracy_reward": 0.170833339355886, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375059604645, + "step": 718 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.341683959961, + "epoch": 0.23011681869099057, + "grad_norm": 1.644096851348877, + "kl": 1.7317875981330872, + "learning_rate": 1.8988103765609788e-05, + "loss": 0.1786, + "reward": 1.080729180574417, + "reward_std": 0.1288726843893528, + "rewards/accuracy_reward": 0.1020833358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458373069763, + "step": 719 + }, + { + "clip_ratio": 0.0, + "completion_length": 678.0250183105469, + "epoch": 0.23043686989918388, + "grad_norm": 0.5850602984428406, + "kl": 1.0677958868443966, + "learning_rate": 1.898319926192204e-05, + "loss": 0.09, + "reward": 1.0921875298023225, + "reward_std": 0.07778571378439665, + "rewards/accuracy_reward": 0.11041666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708432674408, + "step": 720 + }, + { + "clip_ratio": 0.0, + "completion_length": 662.208349609375, + "epoch": 0.2307569211073772, + "grad_norm": 7.026660919189453, + "kl": 3.291440422087908, + "learning_rate": 1.897828353781614e-05, + "loss": 0.2707, + "reward": 0.9755208551883697, + "reward_std": 0.10903428643941879, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708432674408, + "step": 721 + }, + { + "clip_ratio": 0.0, + "completion_length": 698.3687744140625, + "epoch": 0.23107697231557048, + "grad_norm": 0.7763417363166809, + "kl": 0.5105241164565086, + "learning_rate": 1.897335659943205e-05, + "loss": 0.0862, + "reward": 1.010416680574417, + "reward_std": 0.09227172508835793, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 722 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.0229339599609, + "epoch": 0.2313970235237638, + "grad_norm": 3.8951375484466553, + "kl": 2.1611140362918375, + "learning_rate": 1.8968418452923735e-05, + "loss": 0.2017, + "reward": 1.1088541865348815, + "reward_std": 0.1484844669699669, + "rewards/accuracy_reward": 0.13750000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541686534881, + "step": 723 + }, + { + "clip_ratio": 0.0, + "completion_length": 673.3583557128907, + "epoch": 0.2317170747319571, + "grad_norm": 0.4952332675457001, + "kl": 0.6473507910966874, + "learning_rate": 1.8963469104459157e-05, + "loss": 0.0957, + "reward": 1.062500011920929, + "reward_std": 0.1178272632881999, + "rewards/accuracy_reward": 0.08541666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833373069763, + "step": 724 + }, + { + "clip_ratio": 0.0, + "completion_length": 638.5625244140625, + "epoch": 0.2320371259401504, + "grad_norm": 105.82144165039062, + "kl": 18.172766876220702, + "learning_rate": 1.8958508560220276e-05, + "loss": 1.323, + "reward": 1.0343750238418579, + "reward_std": 0.1931321881711483, + "rewards/accuracy_reward": 0.07916666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9552083432674408, + "step": 725 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.7812683105469, + "epoch": 0.23235717714834372, + "grad_norm": 1.5600378513336182, + "kl": 0.5689830243587494, + "learning_rate": 1.8953536826403035e-05, + "loss": 0.102, + "reward": 1.1260416984558106, + "reward_std": 0.17821024954319, + "rewards/accuracy_reward": 0.1562500052154064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916746139527, + "step": 726 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.5479309082032, + "epoch": 0.23267722835653704, + "grad_norm": 12.669792175292969, + "kl": 4.572777527570724, + "learning_rate": 1.8948553909217354e-05, + "loss": 0.4281, + "reward": 1.0020833492279053, + "reward_std": 0.20425619408488274, + "rewards/accuracy_reward": 0.04791666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9541666805744171, + "step": 727 + }, + { + "clip_ratio": 0.0, + "completion_length": 642.2770935058594, + "epoch": 0.23299727956473035, + "grad_norm": 1.4605772495269775, + "kl": 1.2693428099155426, + "learning_rate": 1.894355981488712e-05, + "loss": 0.1494, + "reward": 1.027604192495346, + "reward_std": 0.14785205852240324, + "rewards/accuracy_reward": 0.06458333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208432674408, + "step": 728 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.6437744140625, + "epoch": 0.23331733077292366, + "grad_norm": 1.3393751382827759, + "kl": 1.4025199614465236, + "learning_rate": 1.8938554549650172e-05, + "loss": 0.1719, + "reward": 1.0317708432674408, + "reward_std": 0.13696985617280005, + "rewards/accuracy_reward": 0.07708333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9546875178813934, + "step": 729 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.658349609375, + "epoch": 0.23363738198111697, + "grad_norm": 4.692083358764648, + "kl": 4.138763834536076, + "learning_rate": 1.893353811975832e-05, + "loss": 0.3455, + "reward": 1.0473958373069763, + "reward_std": 0.1992236189544201, + "rewards/accuracy_reward": 0.10833333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9390625059604645, + "step": 730 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.952099609375, + "epoch": 0.23395743318931028, + "grad_norm": 1.7119262218475342, + "kl": 1.134942190349102, + "learning_rate": 1.8928510531477305e-05, + "loss": 0.181, + "reward": 0.9755208551883697, + "reward_std": 0.1736592784523964, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9380208432674408, + "step": 731 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.6604309082031, + "epoch": 0.2342774843975036, + "grad_norm": 2.9206032752990723, + "kl": 2.6096622347831726, + "learning_rate": 1.892347179108681e-05, + "loss": 0.2967, + "reward": 0.9921875178813935, + "reward_std": 0.1957916386425495, + "rewards/accuracy_reward": 0.05208333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9401041865348816, + "step": 732 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.6271118164062, + "epoch": 0.2345975356056969, + "grad_norm": 10.899141311645508, + "kl": 5.642543570697308, + "learning_rate": 1.891842190488045e-05, + "loss": 0.5927, + "reward": 1.045312523841858, + "reward_std": 0.19487025067210198, + "rewards/accuracy_reward": 0.1062500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9390625178813934, + "step": 733 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.308349609375, + "epoch": 0.23491758681389022, + "grad_norm": 9.065811157226562, + "kl": 4.151773124933243, + "learning_rate": 1.891336087916576e-05, + "loss": 0.4228, + "reward": 0.9750000238418579, + "reward_std": 0.20050331354141235, + "rewards/accuracy_reward": 0.050000001303851606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.925000011920929, + "step": 734 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.2458557128906, + "epoch": 0.23523763802208353, + "grad_norm": 2.479764223098755, + "kl": 0.7591698169708252, + "learning_rate": 1.8908288720264184e-05, + "loss": 0.2353, + "reward": 1.027604204416275, + "reward_std": 0.23811267241835593, + "rewards/accuracy_reward": 0.12083333842456341, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9067708492279053, + "step": 735 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.2541809082031, + "epoch": 0.23555768923027684, + "grad_norm": 3.136521577835083, + "kl": 0.9379741698503494, + "learning_rate": 1.8903205434511072e-05, + "loss": 0.2684, + "reward": 1.041666680574417, + "reward_std": 0.1941637597978115, + "rewards/accuracy_reward": 0.11458333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 736 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.9104309082031, + "epoch": 0.23587774043847015, + "grad_norm": 15.852173805236816, + "kl": 8.845919364690781, + "learning_rate": 1.8898111028255686e-05, + "loss": 0.8396, + "reward": 1.0156250238418578, + "reward_std": 0.1994689255952835, + "rewards/accuracy_reward": 0.08333333544433116, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 737 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.114599609375, + "epoch": 0.23619779164666346, + "grad_norm": 10.610895156860352, + "kl": 7.209189605712891, + "learning_rate": 1.889300550786116e-05, + "loss": 0.7747, + "reward": 1.0208333492279054, + "reward_std": 0.21294072940945624, + "rewards/accuracy_reward": 0.10833333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9125000178813935, + "step": 738 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.3521087646484, + "epoch": 0.23651784285485677, + "grad_norm": 1.1002973318099976, + "kl": 2.1927175372838974, + "learning_rate": 1.888788887970452e-05, + "loss": 0.3327, + "reward": 0.9885416805744172, + "reward_std": 0.19914889633655547, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9156250119209289, + "step": 739 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.7937713623047, + "epoch": 0.2368378940630501, + "grad_norm": 0.6471092104911804, + "kl": 2.1248645067214964, + "learning_rate": 1.888276115017666e-05, + "loss": 0.3223, + "reward": 0.9750000238418579, + "reward_std": 0.16674922611564397, + "rewards/accuracy_reward": 0.03333333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9416666865348816, + "step": 740 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.6062744140625, + "epoch": 0.2371579452712434, + "grad_norm": 2.839332342147827, + "kl": 3.6467182874679565, + "learning_rate": 1.887762232568235e-05, + "loss": 0.4353, + "reward": 1.0088541865348817, + "reward_std": 0.23403916209936143, + "rewards/accuracy_reward": 0.07291666809469462, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.935937511920929, + "step": 741 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.931265258789, + "epoch": 0.2374779964794367, + "grad_norm": 2.3406782150268555, + "kl": 1.8981781423091888, + "learning_rate": 1.8872472412640207e-05, + "loss": 0.339, + "reward": 1.0604166984558105, + "reward_std": 0.19939529821276664, + "rewards/accuracy_reward": 0.12708333637565375, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9312500178813934, + "step": 742 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.2083587646484, + "epoch": 0.23779804768763002, + "grad_norm": 2.462381601333618, + "kl": 2.055042415857315, + "learning_rate": 1.8867311417482707e-05, + "loss": 0.3379, + "reward": 0.9421875178813934, + "reward_std": 0.21793267950415612, + "rewards/accuracy_reward": 0.022916667722165586, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9192708551883697, + "step": 743 + }, + { + "clip_ratio": 0.0, + "completion_length": 497.708349609375, + "epoch": 0.23811809889582333, + "grad_norm": 9.371177673339844, + "kl": 7.4982593089342116, + "learning_rate": 1.886213934665616e-05, + "loss": 0.6855, + "reward": 0.9718750298023224, + "reward_std": 0.2046487707644701, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9260416865348816, + "step": 744 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.5521057128906, + "epoch": 0.23843815010401664, + "grad_norm": 4.96921443939209, + "kl": 6.159427142143249, + "learning_rate": 1.8856956206620717e-05, + "loss": 0.6304, + "reward": 0.9744791984558105, + "reward_std": 0.2073620229959488, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.907812523841858, + "step": 745 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.2291809082031, + "epoch": 0.23875820131220996, + "grad_norm": 2.8724591732025146, + "kl": 1.0669405221939088, + "learning_rate": 1.8851762003850348e-05, + "loss": 0.2729, + "reward": 1.002604204416275, + "reward_std": 0.2241446740925312, + "rewards/accuracy_reward": 0.07291666902601719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9296875238418579, + "step": 746 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.4291778564453, + "epoch": 0.23907825252040327, + "grad_norm": 2.7185521125793457, + "kl": 1.5261371374130248, + "learning_rate": 1.8846556744832852e-05, + "loss": 0.3374, + "reward": 0.967187511920929, + "reward_std": 0.272777758538723, + "rewards/accuracy_reward": 0.08333333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8838541805744171, + "step": 747 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.4187713623047, + "epoch": 0.23939830372859658, + "grad_norm": 1.0410653352737427, + "kl": 1.700943198800087, + "learning_rate": 1.8841340436069825e-05, + "loss": 0.3202, + "reward": 0.9619791865348816, + "reward_std": 0.23116603791713713, + "rewards/accuracy_reward": 0.03541666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.926562511920929, + "step": 748 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.2875183105469, + "epoch": 0.2397183549367899, + "grad_norm": 4.877148628234863, + "kl": 3.820130455493927, + "learning_rate": 1.8836113084076673e-05, + "loss": 0.5104, + "reward": 1.028125023841858, + "reward_std": 0.2020101472735405, + "rewards/accuracy_reward": 0.0937500026077032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9343750059604645, + "step": 749 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.7541778564453, + "epoch": 0.2400384061449832, + "grad_norm": 11.63560676574707, + "kl": 8.239775601029397, + "learning_rate": 1.883087469538259e-05, + "loss": 0.7489, + "reward": 0.9447916865348815, + "reward_std": 0.1876985676586628, + "rewards/accuracy_reward": 0.01041666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.934375011920929, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.8208435058593, + "epoch": 0.2403584573531765, + "grad_norm": 8.246529579162598, + "kl": 6.4461568117141725, + "learning_rate": 1.8825625276530558e-05, + "loss": 0.701, + "reward": 1.0072916865348815, + "reward_std": 0.23445617109537126, + "rewards/accuracy_reward": 0.10625000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9010416805744171, + "step": 751 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.65834655761716, + "epoch": 0.24067850856136982, + "grad_norm": 1.731472134590149, + "kl": 1.476518702507019, + "learning_rate": 1.882036483407734e-05, + "loss": 0.3387, + "reward": 1.0395833551883698, + "reward_std": 0.2580983817577362, + "rewards/accuracy_reward": 0.10000000353902579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9395833373069763, + "step": 752 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.2354370117188, + "epoch": 0.24099855976956314, + "grad_norm": 1.2525382041931152, + "kl": 1.777240651845932, + "learning_rate": 1.8815093374593463e-05, + "loss": 0.3417, + "reward": 0.9989583551883697, + "reward_std": 0.22307676412165164, + "rewards/accuracy_reward": 0.07500000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9239583492279053, + "step": 753 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.858349609375, + "epoch": 0.24131861097775645, + "grad_norm": 1.0679889917373657, + "kl": 2.1552729278802873, + "learning_rate": 1.880981090466321e-05, + "loss": 0.4127, + "reward": 0.9875000238418579, + "reward_std": 0.20257178843021392, + "rewards/accuracy_reward": 0.06250000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.925000011920929, + "step": 754 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.0416870117188, + "epoch": 0.24163866218594976, + "grad_norm": 1.1074210405349731, + "kl": 1.7916708946228028, + "learning_rate": 1.8804517430884633e-05, + "loss": 0.3344, + "reward": 0.9458333551883698, + "reward_std": 0.2129554446786642, + "rewards/accuracy_reward": 0.025000000558793544, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9208333492279053, + "step": 755 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.1396026611328, + "epoch": 0.24195871339414307, + "grad_norm": 0.8018332719802856, + "kl": 2.069081211090088, + "learning_rate": 1.879921295986951e-05, + "loss": 0.359, + "reward": 1.0026041865348816, + "reward_std": 0.21627548113465309, + "rewards/accuracy_reward": 0.08125000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9213541805744171, + "step": 756 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.2937683105469, + "epoch": 0.24227876460233638, + "grad_norm": 0.5721918344497681, + "kl": 2.462398773431778, + "learning_rate": 1.879389749824336e-05, + "loss": 0.3709, + "reward": 0.9510416805744171, + "reward_std": 0.23518796935677527, + "rewards/accuracy_reward": 0.05000000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9010416805744171, + "step": 757 + }, + { + "clip_ratio": 0.0, + "completion_length": 628.2625183105469, + "epoch": 0.2425988158105297, + "grad_norm": 1.7147362232208252, + "kl": 2.181269180774689, + "learning_rate": 1.8788571052645448e-05, + "loss": 0.3649, + "reward": 0.9447916924953461, + "reward_std": 0.26175126880407334, + "rewards/accuracy_reward": 0.06250000149011611, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8822916805744171, + "step": 758 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.789599609375, + "epoch": 0.242918867018723, + "grad_norm": 1.032857060432434, + "kl": 3.850922179222107, + "learning_rate": 1.8783233629728725e-05, + "loss": 0.4549, + "reward": 0.8744791865348815, + "reward_std": 0.2859712585806847, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8536458432674408, + "step": 759 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.3812622070312, + "epoch": 0.24323891822691632, + "grad_norm": 1.8498499393463135, + "kl": 2.852463459968567, + "learning_rate": 1.877788523615988e-05, + "loss": 0.3521, + "reward": 0.8567708551883697, + "reward_std": 0.29042457044124603, + "rewards/accuracy_reward": 0.020833334326744078, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8359375178813935, + "step": 760 + }, + { + "clip_ratio": 0.0, + "completion_length": 670.9916809082031, + "epoch": 0.24355896943510963, + "grad_norm": 3.006237030029297, + "kl": 6.155748796463013, + "learning_rate": 1.87725258786193e-05, + "loss": 0.6083, + "reward": 0.8255208492279053, + "reward_std": 0.2984800562262535, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7880208492279053, + "step": 761 + }, + { + "clip_ratio": 0.0, + "completion_length": 622.9521118164063, + "epoch": 0.24387902064330294, + "grad_norm": 1.1661911010742188, + "kl": 3.726873683929443, + "learning_rate": 1.8767155563801053e-05, + "loss": 0.4531, + "reward": 0.8677083551883698, + "reward_std": 0.28935869932174685, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7947916865348816, + "step": 762 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.7646057128907, + "epoch": 0.24419907185149625, + "grad_norm": 2.272340774536133, + "kl": 4.760553753376007, + "learning_rate": 1.8761774298412905e-05, + "loss": 0.5509, + "reward": 0.8604166984558106, + "reward_std": 0.2656098708510399, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8250000238418579, + "step": 763 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.7062683105469, + "epoch": 0.24451912305968956, + "grad_norm": 1.5633440017700195, + "kl": 1.859854531288147, + "learning_rate": 1.8756382089176303e-05, + "loss": 0.3324, + "reward": 0.9937500059604645, + "reward_std": 0.2956600204110146, + "rewards/accuracy_reward": 0.1062500050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.887500011920929, + "step": 764 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.6916900634766, + "epoch": 0.24483917426788285, + "grad_norm": 0.5739659667015076, + "kl": 2.4789600491523744, + "learning_rate": 1.8750978942826353e-05, + "loss": 0.3688, + "reward": 0.9510416924953461, + "reward_std": 0.26511411666870116, + "rewards/accuracy_reward": 0.08333333432674409, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8677083492279053, + "step": 765 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.0354431152343, + "epoch": 0.24515922547607616, + "grad_norm": 0.3142963647842407, + "kl": 1.9652863681316375, + "learning_rate": 1.874556486611183e-05, + "loss": 0.3312, + "reward": 0.9567708730697632, + "reward_std": 0.3204138189554214, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8796875238418579, + "step": 766 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.9125122070312, + "epoch": 0.24547927668426947, + "grad_norm": 0.2902325987815857, + "kl": 2.781359338760376, + "learning_rate": 1.8740139865795154e-05, + "loss": 0.4133, + "reward": 0.8859375238418579, + "reward_std": 0.28512853384017944, + "rewards/accuracy_reward": 0.04791666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8380208551883698, + "step": 767 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.4104431152343, + "epoch": 0.24579932789246278, + "grad_norm": 1.0774624347686768, + "kl": 2.3280814051628114, + "learning_rate": 1.8734703948652398e-05, + "loss": 0.4042, + "reward": 0.8906250178813935, + "reward_std": 0.28475052416324614, + "rewards/accuracy_reward": 0.043750002048909664, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.846875011920929, + "step": 768 + }, + { + "clip_ratio": 0.0, + "completion_length": 637.6333618164062, + "epoch": 0.2461193791006561, + "grad_norm": 2.035698413848877, + "kl": 4.03775839805603, + "learning_rate": 1.8729257121473262e-05, + "loss": 0.474, + "reward": 0.8588541805744171, + "reward_std": 0.31844222843647, + "rewards/accuracy_reward": 0.025000001303851604, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8338541805744171, + "step": 769 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.2562683105468, + "epoch": 0.2464394303088494, + "grad_norm": 0.49926403164863586, + "kl": 3.1284287214279174, + "learning_rate": 1.872379939106108e-05, + "loss": 0.397, + "reward": 0.8848958492279053, + "reward_std": 0.3051090121269226, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8453125059604645, + "step": 770 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.7354370117188, + "epoch": 0.24675948151704272, + "grad_norm": 0.631862998008728, + "kl": 3.050023341178894, + "learning_rate": 1.8718330764232802e-05, + "loss": 0.4031, + "reward": 0.9828125357627868, + "reward_std": 0.29355679303407667, + "rewards/accuracy_reward": 0.15208333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8307291865348816, + "step": 771 + }, + { + "clip_ratio": 0.0, + "completion_length": 763.3729309082031, + "epoch": 0.24707953272523603, + "grad_norm": 5.6362738609313965, + "kl": 7.783353233337403, + "learning_rate": 1.8712851247818985e-05, + "loss": 0.6343, + "reward": 0.7375000238418579, + "reward_std": 0.3411813169717789, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6708333551883697, + "step": 772 + }, + { + "clip_ratio": 0.0, + "completion_length": 781.3562805175782, + "epoch": 0.24739958393342934, + "grad_norm": 1.8554197549819946, + "kl": 5.768043446540832, + "learning_rate": 1.870736084866379e-05, + "loss": 0.4355, + "reward": 0.6088541805744171, + "reward_std": 0.342638885974884, + "rewards/accuracy_reward": 0.00625, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6026041805744171, + "step": 773 + }, + { + "clip_ratio": 0.0, + "completion_length": 808.0021057128906, + "epoch": 0.24771963514162265, + "grad_norm": 3.9803450107574463, + "kl": 1.709758222103119, + "learning_rate": 1.8701859573624975e-05, + "loss": 0.1949, + "reward": 0.8083333551883698, + "reward_std": 0.2954397678375244, + "rewards/accuracy_reward": 0.11250000409781932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6958333551883698, + "step": 774 + }, + { + "clip_ratio": 0.0, + "completion_length": 839.1916870117187, + "epoch": 0.24803968634981596, + "grad_norm": 3.2302024364471436, + "kl": 1.209952062368393, + "learning_rate": 1.869634742957388e-05, + "loss": 0.1301, + "reward": 0.7463541865348816, + "reward_std": 0.30243373960256575, + "rewards/accuracy_reward": 0.02083333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7255208551883697, + "step": 775 + }, + { + "clip_ratio": 0.0, + "completion_length": 786.752099609375, + "epoch": 0.24835973755800927, + "grad_norm": 2.0077474117279053, + "kl": 1.2873852461576463, + "learning_rate": 1.8690824423395412e-05, + "loss": 0.1697, + "reward": 0.9005208551883698, + "reward_std": 0.31336794048547745, + "rewards/accuracy_reward": 0.1333333408460021, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.767187523841858, + "step": 776 + }, + { + "clip_ratio": 0.0, + "completion_length": 678.7021057128907, + "epoch": 0.2486797887662026, + "grad_norm": 1.3721160888671875, + "kl": 1.352865958213806, + "learning_rate": 1.868529056198806e-05, + "loss": 0.2235, + "reward": 0.9020833551883698, + "reward_std": 0.26901768147945404, + "rewards/accuracy_reward": 0.04166666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8604166865348816, + "step": 777 + }, + { + "clip_ratio": 0.0, + "completion_length": 718.2958557128907, + "epoch": 0.2489998399743959, + "grad_norm": 1.037018060684204, + "kl": 1.5384167373180389, + "learning_rate": 1.867974585226386e-05, + "loss": 0.1842, + "reward": 0.9026041865348816, + "reward_std": 0.22797319442033767, + "rewards/accuracy_reward": 0.016666667722165584, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8859375178813934, + "step": 778 + }, + { + "clip_ratio": 0.0, + "completion_length": 786.62294921875, + "epoch": 0.2493198911825892, + "grad_norm": 1.917616605758667, + "kl": 2.3354121506214143, + "learning_rate": 1.8674190301148406e-05, + "loss": 0.1736, + "reward": 0.9625000357627869, + "reward_std": 0.23147097155451773, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8979166924953461, + "step": 779 + }, + { + "clip_ratio": 0.0, + "completion_length": 856.283349609375, + "epoch": 0.24963994239078252, + "grad_norm": 1.3474206924438477, + "kl": 2.2255136251449583, + "learning_rate": 1.866862391558083e-05, + "loss": 0.0977, + "reward": 0.8802083551883697, + "reward_std": 0.20946567207574845, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8093750238418579, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 815.7333557128907, + "epoch": 0.24995999359897583, + "grad_norm": 0.525276780128479, + "kl": 1.391992512345314, + "learning_rate": 1.8663046702513795e-05, + "loss": 0.0309, + "reward": 0.7567708432674408, + "reward_std": 0.13151366412639617, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7213541805744171, + "step": 781 + }, + { + "clip_ratio": 0.0, + "completion_length": 929.2333557128907, + "epoch": 0.25028004480716914, + "grad_norm": 0.5646325945854187, + "kl": 1.0617589622735977, + "learning_rate": 1.8657458668913493e-05, + "loss": 0.0313, + "reward": 0.7848958671092987, + "reward_std": 0.0879902821034193, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7182291924953461, + "step": 782 + }, + { + "clip_ratio": 0.0, + "completion_length": 826.2625122070312, + "epoch": 0.2506000960153625, + "grad_norm": 1.0494465827941895, + "kl": 0.8743727058172226, + "learning_rate": 1.8651859821759623e-05, + "loss": 0.0169, + "reward": 0.7416666924953461, + "reward_std": 0.1264648325741291, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7062500238418579, + "step": 783 + }, + { + "clip_ratio": 0.0, + "completion_length": 754.9791870117188, + "epoch": 0.25092014722355577, + "grad_norm": 0.7133949398994446, + "kl": 1.71711206138134, + "learning_rate": 1.8646250168045402e-05, + "loss": 0.0536, + "reward": 0.8036458551883697, + "reward_std": 0.1877228483557701, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7036458551883698, + "step": 784 + }, + { + "clip_ratio": 0.0, + "completion_length": 789.0812683105469, + "epoch": 0.2512401984317491, + "grad_norm": 34.78798294067383, + "kl": 4.583227729797363, + "learning_rate": 1.8640629714777536e-05, + "loss": 0.1088, + "reward": 0.8328125178813934, + "reward_std": 0.23548691123723983, + "rewards/accuracy_reward": 0.01666666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8161458492279052, + "step": 785 + }, + { + "clip_ratio": 0.0, + "completion_length": 669.0083557128906, + "epoch": 0.2515602496399424, + "grad_norm": 1.3595727682113647, + "kl": 2.5710567235946655, + "learning_rate": 1.8634998468976225e-05, + "loss": 0.2484, + "reward": 0.7208333611488342, + "reward_std": 0.17345971316099168, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6875000298023224, + "step": 786 + }, + { + "clip_ratio": 0.0, + "completion_length": 772.6666870117188, + "epoch": 0.2518803008481357, + "grad_norm": 1.8687893152236938, + "kl": 3.129037153720856, + "learning_rate": 1.862935643767514e-05, + "loss": 0.1782, + "reward": 0.6364583551883698, + "reward_std": 0.20351852625608444, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6010416865348815, + "step": 787 + }, + { + "clip_ratio": 0.0, + "completion_length": 683.4479370117188, + "epoch": 0.252200352056329, + "grad_norm": 0.4782872200012207, + "kl": 2.446159327030182, + "learning_rate": 1.862370362792144e-05, + "loss": 0.217, + "reward": 0.7505208671092987, + "reward_std": 0.19110870510339736, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6713541924953461, + "step": 788 + }, + { + "clip_ratio": 0.0, + "completion_length": 703.6479370117188, + "epoch": 0.2525204032645223, + "grad_norm": 0.7935335636138916, + "kl": 2.2065913677215576, + "learning_rate": 1.8618040046775727e-05, + "loss": 0.1883, + "reward": 0.7151041805744172, + "reward_std": 0.19547585546970367, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6484375178813935, + "step": 789 + }, + { + "clip_ratio": 0.0, + "completion_length": 702.8125183105469, + "epoch": 0.25284045447271564, + "grad_norm": 1.0700191259384155, + "kl": 1.690595942735672, + "learning_rate": 1.8612365701312075e-05, + "loss": 0.1709, + "reward": 0.6677083492279052, + "reward_std": 0.1768379256129265, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6656250178813934, + "step": 790 + }, + { + "clip_ratio": 0.0, + "completion_length": 721.0458435058594, + "epoch": 0.2531605056809089, + "grad_norm": 0.8433765769004822, + "kl": 1.9942040205001832, + "learning_rate": 1.8606680598617995e-05, + "loss": 0.1587, + "reward": 0.6973958551883698, + "reward_std": 0.1701609805226326, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6640625178813935, + "step": 791 + }, + { + "clip_ratio": 0.0, + "completion_length": 667.1479370117188, + "epoch": 0.25348055688910226, + "grad_norm": 4.201164722442627, + "kl": 2.1893013775348664, + "learning_rate": 1.8600984745794438e-05, + "loss": 0.1323, + "reward": 0.6645833551883698, + "reward_std": 0.1957714796066284, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6312500238418579, + "step": 792 + }, + { + "clip_ratio": 0.0, + "completion_length": 891.3812683105468, + "epoch": 0.25380060809729554, + "grad_norm": 0.30567294359207153, + "kl": 2.2855528831481933, + "learning_rate": 1.859527814995577e-05, + "loss": 0.0917, + "reward": 0.7296875178813934, + "reward_std": 0.17712628692388535, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6630208492279053, + "step": 793 + }, + { + "clip_ratio": 0.0, + "completion_length": 886.8958618164063, + "epoch": 0.2541206593054889, + "grad_norm": 0.6789547204971313, + "kl": 1.546866774559021, + "learning_rate": 1.858956081822979e-05, + "loss": 0.0331, + "reward": 0.7739583611488342, + "reward_std": 0.16994911432266235, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6739583551883698, + "step": 794 + }, + { + "clip_ratio": 0.0, + "completion_length": 845.0812744140625, + "epoch": 0.25444071051368217, + "grad_norm": 0.2693403363227844, + "kl": 1.9320049643516541, + "learning_rate": 1.8583832757757708e-05, + "loss": 0.1012, + "reward": 0.7270833611488342, + "reward_std": 0.17218801006674767, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6604166865348816, + "step": 795 + }, + { + "clip_ratio": 0.0, + "completion_length": 863.764599609375, + "epoch": 0.2547607617218755, + "grad_norm": 0.27951666712760925, + "kl": 2.302595019340515, + "learning_rate": 1.8578093975694116e-05, + "loss": 0.0908, + "reward": 0.7333333551883697, + "reward_std": 0.17331424206495286, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6645833492279053, + "step": 796 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.2771057128906, + "epoch": 0.2550808129300688, + "grad_norm": 0.3106688857078552, + "kl": 2.457894867658615, + "learning_rate": 1.8572344479207015e-05, + "loss": 0.0761, + "reward": 0.6953125238418579, + "reward_std": 0.17134494259953498, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6619791865348816, + "step": 797 + }, + { + "clip_ratio": 0.0, + "completion_length": 939.3854431152344, + "epoch": 0.25540086413826213, + "grad_norm": 0.24902932345867157, + "kl": 1.85448357462883, + "learning_rate": 1.8566584275477783e-05, + "loss": 0.0579, + "reward": 0.6687500238418579, + "reward_std": 0.15060244724154473, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6687500238418579, + "step": 798 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.8750122070312, + "epoch": 0.2557209153464554, + "grad_norm": 0.6678584218025208, + "kl": 1.8299700140953064, + "learning_rate": 1.8560813371701174e-05, + "loss": 0.0511, + "reward": 0.6729166865348816, + "reward_std": 0.15844282358884812, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6729166865348816, + "step": 799 + }, + { + "clip_ratio": 0.0, + "completion_length": 935.5291809082031, + "epoch": 0.25604096655464875, + "grad_norm": 0.3589387834072113, + "kl": 2.199638992547989, + "learning_rate": 1.8555031775085307e-05, + "loss": 0.0748, + "reward": 0.7260416924953461, + "reward_std": 0.17357457876205445, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6906250238418579, + "step": 800 + }, + { + "clip_ratio": 0.0, + "completion_length": 925.0354370117187, + "epoch": 0.25636101776284204, + "grad_norm": 0.577064573764801, + "kl": 2.5014767736196517, + "learning_rate": 1.854923949285165e-05, + "loss": 0.1002, + "reward": 0.7541666924953461, + "reward_std": 0.1812018111348152, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7145833551883698, + "step": 801 + }, + { + "clip_ratio": 0.0, + "completion_length": 835.470849609375, + "epoch": 0.2566810689710354, + "grad_norm": 0.31030163168907166, + "kl": 2.0380566120147705, + "learning_rate": 1.8543436532235024e-05, + "loss": 0.0894, + "reward": 0.8473958611488343, + "reward_std": 0.21838683038949966, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7807291865348815, + "step": 802 + }, + { + "clip_ratio": 0.0, + "completion_length": 757.6479248046875, + "epoch": 0.25700112017922866, + "grad_norm": 0.29996275901794434, + "kl": 1.3410570591688156, + "learning_rate": 1.853762290048359e-05, + "loss": 0.1172, + "reward": 0.9130208492279053, + "reward_std": 0.21495410203933715, + "rewards/accuracy_reward": 0.04166666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8713541805744172, + "step": 803 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.0166931152344, + "epoch": 0.257321171387422, + "grad_norm": 0.4481050968170166, + "kl": 0.744247005879879, + "learning_rate": 1.853179860485883e-05, + "loss": 0.0867, + "reward": 1.0031250298023224, + "reward_std": 0.19283585250377655, + "rewards/accuracy_reward": 0.11666667014360428, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8864583492279052, + "step": 804 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.3208557128906, + "epoch": 0.2576412225956153, + "grad_norm": 1.6031851768493652, + "kl": 1.8358533322811126, + "learning_rate": 1.8525963652635556e-05, + "loss": 0.1293, + "reward": 0.9744791924953461, + "reward_std": 0.2302825279533863, + "rewards/accuracy_reward": 0.1000000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8744791865348815, + "step": 805 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.2333557128907, + "epoch": 0.2579612738038086, + "grad_norm": 0.8979751467704773, + "kl": 1.0591185629367827, + "learning_rate": 1.852011805110188e-05, + "loss": 0.1109, + "reward": 1.0229166865348815, + "reward_std": 0.177987564727664, + "rewards/accuracy_reward": 0.09166666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9312500119209289, + "step": 806 + }, + { + "clip_ratio": 0.0, + "completion_length": 661.3687683105469, + "epoch": 0.2582813250120019, + "grad_norm": 0.4129449427127838, + "kl": 1.2034089416265488, + "learning_rate": 1.851426180755922e-05, + "loss": 0.1144, + "reward": 0.9953125178813934, + "reward_std": 0.15365473832935095, + "rewards/accuracy_reward": 0.0458333345130086, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9494791865348816, + "step": 807 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.5812744140625, + "epoch": 0.25860137622019524, + "grad_norm": 0.3640819489955902, + "kl": 1.47001773416996, + "learning_rate": 1.8508394929322287e-05, + "loss": 0.1258, + "reward": 0.9661458551883697, + "reward_std": 0.12315437085926532, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958492279053, + "step": 808 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.7125183105469, + "epoch": 0.25892142742838853, + "grad_norm": 0.9221828579902649, + "kl": 2.0171607047319413, + "learning_rate": 1.8502517423719075e-05, + "loss": 0.113, + "reward": 1.0677083671092986, + "reward_std": 0.16264262348413466, + "rewards/accuracy_reward": 0.12500000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9427083492279053, + "step": 809 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.5166809082032, + "epoch": 0.25924147863658187, + "grad_norm": 0.5077120661735535, + "kl": 1.0036138698458672, + "learning_rate": 1.8496629298090855e-05, + "loss": 0.0882, + "reward": 0.963541692495346, + "reward_std": 0.1293813869357109, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9593750178813935, + "step": 810 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.4104309082031, + "epoch": 0.25956152984477515, + "grad_norm": 0.6715332269668579, + "kl": 1.3229067370295524, + "learning_rate": 1.8490730559792153e-05, + "loss": 0.121, + "reward": 1.0197916865348815, + "reward_std": 0.11515746731311083, + "rewards/accuracy_reward": 0.05625000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9635416805744171, + "step": 811 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.3958465576172, + "epoch": 0.2598815810529685, + "grad_norm": 0.5190712809562683, + "kl": 0.5667064756155014, + "learning_rate": 1.848482121619076e-05, + "loss": 0.0447, + "reward": 1.0526041865348816, + "reward_std": 0.13494310155510902, + "rewards/accuracy_reward": 0.07916667126119137, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375059604645, + "step": 812 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.341683959961, + "epoch": 0.2602016322611618, + "grad_norm": 0.6430520415306091, + "kl": 0.50772774964571, + "learning_rate": 1.8478901274667716e-05, + "loss": 0.0742, + "reward": 1.0723958611488342, + "reward_std": 0.10893401503562927, + "rewards/accuracy_reward": 0.09583333935588598, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625178813935, + "step": 813 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.8104309082031, + "epoch": 0.2605216834693551, + "grad_norm": 1.1371740102767944, + "kl": 2.0117278814315798, + "learning_rate": 1.8472970742617284e-05, + "loss": 0.229, + "reward": 0.9604166805744171, + "reward_std": 0.18165156841278077, + "rewards/accuracy_reward": 0.03125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9291666805744171, + "step": 814 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.8437561035156, + "epoch": 0.2608417346775484, + "grad_norm": 0.37978023290634155, + "kl": 1.3154131323099136, + "learning_rate": 1.846702962744697e-05, + "loss": 0.1291, + "reward": 1.0223958432674407, + "reward_std": 0.1484291136264801, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.951562511920929, + "step": 815 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.9187683105469, + "epoch": 0.26116178588574174, + "grad_norm": 0.34045541286468506, + "kl": 1.4208843201398849, + "learning_rate": 1.8461077936577495e-05, + "loss": 0.1463, + "reward": 0.9494791805744172, + "reward_std": 0.18640333712100982, + "rewards/accuracy_reward": 0.0125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9369791805744171, + "step": 816 + }, + { + "clip_ratio": 0.0, + "completion_length": 508.3791809082031, + "epoch": 0.261481837093935, + "grad_norm": 0.4059346616268158, + "kl": 1.6916437029838562, + "learning_rate": 1.8455115677442782e-05, + "loss": 0.1865, + "reward": 1.0869791865348817, + "reward_std": 0.2534780815243721, + "rewards/accuracy_reward": 0.1562500052154064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9307291805744171, + "step": 817 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.2896057128906, + "epoch": 0.26180188830212836, + "grad_norm": 0.6775670647621155, + "kl": 2.5077103793621065, + "learning_rate": 1.844914285748996e-05, + "loss": 0.2779, + "reward": 0.9302083492279053, + "reward_std": 0.24873557239770888, + "rewards/accuracy_reward": 0.03333333339542151, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8968750059604644, + "step": 818 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.7562622070312, + "epoch": 0.26212193951032164, + "grad_norm": 0.7965701222419739, + "kl": 2.863987410068512, + "learning_rate": 1.8443159484179348e-05, + "loss": 0.3095, + "reward": 0.9828125298023224, + "reward_std": 0.26197887808084486, + "rewards/accuracy_reward": 0.11250000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8703125178813934, + "step": 819 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.910433959961, + "epoch": 0.262441990718515, + "grad_norm": 0.577987015247345, + "kl": 1.5371546924114228, + "learning_rate": 1.8437165564984455e-05, + "loss": 0.1004, + "reward": 1.0109375178813935, + "reward_std": 0.2512478806078434, + "rewards/accuracy_reward": 0.10625000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.904687511920929, + "step": 820 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.1666839599609, + "epoch": 0.26276204192670827, + "grad_norm": 0.9527766704559326, + "kl": 1.177549660205841, + "learning_rate": 1.8431161107391947e-05, + "loss": 0.124, + "reward": 0.9760416865348815, + "reward_std": 0.2343311682343483, + "rewards/accuracy_reward": 0.06041666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9156250178813934, + "step": 821 + }, + { + "clip_ratio": 0.0, + "completion_length": 508.45001831054685, + "epoch": 0.2630820931349016, + "grad_norm": 0.35349592566490173, + "kl": 2.095759892463684, + "learning_rate": 1.8425146118901664e-05, + "loss": 0.1814, + "reward": 0.9859375298023224, + "reward_std": 0.20536768585443496, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9130208551883697, + "step": 822 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.160433959961, + "epoch": 0.2634021443430949, + "grad_norm": 1.703538179397583, + "kl": 3.468520486354828, + "learning_rate": 1.841912060702659e-05, + "loss": 0.3395, + "reward": 0.9875000298023224, + "reward_std": 0.26018320918083193, + "rewards/accuracy_reward": 0.09583333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8916666805744171, + "step": 823 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.3896026611328, + "epoch": 0.26372219555128823, + "grad_norm": 0.6156812310218811, + "kl": 2.648340845108032, + "learning_rate": 1.8413084579292868e-05, + "loss": 0.2845, + "reward": 0.9500000238418579, + "reward_std": 0.2134397841989994, + "rewards/accuracy_reward": 0.04375000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9062500238418579, + "step": 824 + }, + { + "clip_ratio": 0.0, + "completion_length": 495.5458465576172, + "epoch": 0.2640422467594815, + "grad_norm": 0.5195518136024475, + "kl": 0.9634673684835434, + "learning_rate": 1.840703804323976e-05, + "loss": 0.1202, + "reward": 1.004166692495346, + "reward_std": 0.20163882821798323, + "rewards/accuracy_reward": 0.052083334513008596, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9520833492279053, + "step": 825 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.0604278564453, + "epoch": 0.26436229796767485, + "grad_norm": 0.7861474752426147, + "kl": 1.254196584224701, + "learning_rate": 1.8400981006419663e-05, + "loss": 0.1639, + "reward": 1.0604166865348816, + "reward_std": 0.20427689626812934, + "rewards/accuracy_reward": 0.12291667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9375000119209289, + "step": 826 + }, + { + "clip_ratio": 0.0, + "completion_length": 497.9104309082031, + "epoch": 0.26468234917586814, + "grad_norm": 0.278336763381958, + "kl": 1.4873881816864014, + "learning_rate": 1.8394913476398087e-05, + "loss": 0.2505, + "reward": 1.0411458611488342, + "reward_std": 0.1484985716640949, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9390625119209289, + "step": 827 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.8458526611328, + "epoch": 0.2650024003840615, + "grad_norm": 0.47557759284973145, + "kl": 1.4743517637252808, + "learning_rate": 1.838883546075365e-05, + "loss": 0.1885, + "reward": 1.0234375238418578, + "reward_std": 0.1315991472452879, + "rewards/accuracy_reward": 0.07291666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9505208492279053, + "step": 828 + }, + { + "clip_ratio": 0.0, + "completion_length": 499.23126220703125, + "epoch": 0.26532245159225476, + "grad_norm": 0.468987375497818, + "kl": 1.7570225208997727, + "learning_rate": 1.8382746967078063e-05, + "loss": 0.2526, + "reward": 0.9932291746139527, + "reward_std": 0.17668437063694, + "rewards/accuracy_reward": 0.05416666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9390625119209289, + "step": 829 + }, + { + "clip_ratio": 0.0, + "completion_length": 491.00001831054686, + "epoch": 0.2656425028004481, + "grad_norm": 0.35486987233161926, + "kl": 2.0428441941738127, + "learning_rate": 1.837664800297613e-05, + "loss": 0.1859, + "reward": 0.9739583492279053, + "reward_std": 0.1656632751226425, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9364583492279053, + "step": 830 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.306265258789, + "epoch": 0.2659625540086414, + "grad_norm": 0.7836453914642334, + "kl": 1.9435631185770035, + "learning_rate": 1.8370538576065725e-05, + "loss": 0.1945, + "reward": 1.0937500238418578, + "reward_std": 0.20406704619526864, + "rewards/accuracy_reward": 0.14375000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9500000178813934, + "step": 831 + }, + { + "clip_ratio": 0.0, + "completion_length": 504.570849609375, + "epoch": 0.26628260521683467, + "grad_norm": 0.7468709945678711, + "kl": 1.106460866332054, + "learning_rate": 1.8364418693977803e-05, + "loss": 0.1291, + "reward": 1.0890625238418579, + "reward_std": 0.16735546700656415, + "rewards/accuracy_reward": 0.12916667088866235, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.957812511920929, + "step": 832 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.9125183105468, + "epoch": 0.266602656425028, + "grad_norm": 0.4349055886268616, + "kl": 1.8148438930511475, + "learning_rate": 1.8358288364356366e-05, + "loss": 0.1068, + "reward": 1.005729180574417, + "reward_std": 0.14619814604520798, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9390625178813934, + "step": 833 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.9812683105469, + "epoch": 0.2669227076332213, + "grad_norm": 0.6686826348304749, + "kl": 1.2067111015319825, + "learning_rate": 1.8352147594858474e-05, + "loss": 0.1426, + "reward": 1.0645833551883697, + "reward_std": 0.13354990780353546, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9604166805744171, + "step": 834 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.9062683105469, + "epoch": 0.26724275884141463, + "grad_norm": 1.4165607690811157, + "kl": 2.314134883880615, + "learning_rate": 1.834599639315422e-05, + "loss": 0.2464, + "reward": 0.9781250119209289, + "reward_std": 0.17349504306912422, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9364583492279053, + "step": 835 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.8916900634765, + "epoch": 0.2675628100496079, + "grad_norm": 0.5959430932998657, + "kl": 1.5935033410787582, + "learning_rate": 1.833983476692673e-05, + "loss": 0.1566, + "reward": 1.0067708611488342, + "reward_std": 0.16007075309753419, + "rewards/accuracy_reward": 0.0583333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9484375178813934, + "step": 836 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.4729400634766, + "epoch": 0.26788286125780125, + "grad_norm": 0.3902430534362793, + "kl": 1.4951316177845002, + "learning_rate": 1.8333662723872154e-05, + "loss": 0.1353, + "reward": 0.9494791865348816, + "reward_std": 0.20283174850046634, + "rewards/accuracy_reward": 0.016666667722165584, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9286458492279053, + "step": 837 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.3229309082031, + "epoch": 0.26820291246599454, + "grad_norm": 0.6958396434783936, + "kl": 0.9415562689304352, + "learning_rate": 1.8327480271699647e-05, + "loss": 0.1359, + "reward": 1.0807291924953462, + "reward_std": 0.13761940076947213, + "rewards/accuracy_reward": 0.13333333730697633, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9473958432674408, + "step": 838 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.2479400634766, + "epoch": 0.2685229636741879, + "grad_norm": 0.5779876708984375, + "kl": 1.5146077901124955, + "learning_rate": 1.8321287418131368e-05, + "loss": 0.1672, + "reward": 1.0000000178813935, + "reward_std": 0.1803262263536453, + "rewards/accuracy_reward": 0.0791666692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9208333551883697, + "step": 839 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.8604278564453, + "epoch": 0.26884301488238116, + "grad_norm": 0.34577852487564087, + "kl": 1.463090929389, + "learning_rate": 1.8315084170902473e-05, + "loss": 0.1737, + "reward": 0.9578125298023223, + "reward_std": 0.21278849691152574, + "rewards/accuracy_reward": 0.02916666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9286458611488342, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.8458557128906, + "epoch": 0.2691630660905745, + "grad_norm": 0.8705610632896423, + "kl": 1.8129864871501922, + "learning_rate": 1.8308870537761094e-05, + "loss": 0.264, + "reward": 0.9651041924953461, + "reward_std": 0.26597666591405866, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.8901041865348815, + "step": 841 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.7208557128906, + "epoch": 0.2694831172987678, + "grad_norm": 0.6193972826004028, + "kl": 1.368115884065628, + "learning_rate": 1.8302646526468337e-05, + "loss": 0.2307, + "reward": 0.9614583551883698, + "reward_std": 0.22459929436445236, + "rewards/accuracy_reward": 0.03333333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9281250178813935, + "step": 842 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.2021087646484, + "epoch": 0.2698031685069611, + "grad_norm": 0.2857276499271393, + "kl": 1.0940616935491563, + "learning_rate": 1.8296412144798266e-05, + "loss": 0.1333, + "reward": 0.9843750059604645, + "reward_std": 0.16745015531778334, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9427083373069763, + "step": 843 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.3583435058594, + "epoch": 0.2701232197151544, + "grad_norm": 0.357234925031662, + "kl": 1.360547348856926, + "learning_rate": 1.829016740053791e-05, + "loss": 0.1622, + "reward": 0.9958333551883698, + "reward_std": 0.1787314772605896, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9291666865348815, + "step": 844 + }, + { + "clip_ratio": 0.0, + "completion_length": 622.8791809082031, + "epoch": 0.27044327092334774, + "grad_norm": 0.5601744651794434, + "kl": 1.5113209307193756, + "learning_rate": 1.8283912301487228e-05, + "loss": 0.2104, + "reward": 0.9359375238418579, + "reward_std": 0.21898051649332045, + "rewards/accuracy_reward": 0.02500000074505806, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.908854192495346, + "step": 845 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.1708526611328, + "epoch": 0.27076332213154103, + "grad_norm": 0.3600686192512512, + "kl": 0.8812868297100067, + "learning_rate": 1.8277646855459124e-05, + "loss": 0.1488, + "reward": 1.0291666984558105, + "reward_std": 0.1729067787528038, + "rewards/accuracy_reward": 0.07500000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9541666805744171, + "step": 846 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.289599609375, + "epoch": 0.27108337333973437, + "grad_norm": 0.2547048330307007, + "kl": 1.0196032211184503, + "learning_rate": 1.8271371070279418e-05, + "loss": 0.0957, + "reward": 1.0218750178813933, + "reward_std": 0.14253914952278138, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 847 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.5541778564453, + "epoch": 0.27140342454792765, + "grad_norm": 0.40000978112220764, + "kl": 1.0541925325989723, + "learning_rate": 1.826508495378685e-05, + "loss": 0.1986, + "reward": 1.0296875178813933, + "reward_std": 0.15220083631575107, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9567708492279052, + "step": 848 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.2229370117187, + "epoch": 0.271723475756121, + "grad_norm": 0.3675740957260132, + "kl": 1.61804456114769, + "learning_rate": 1.825878851383305e-05, + "loss": 0.16, + "reward": 0.981250011920929, + "reward_std": 0.1732964960858226, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9416666746139526, + "step": 849 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.0250122070313, + "epoch": 0.2720435269643143, + "grad_norm": 0.3455909490585327, + "kl": 1.393028575181961, + "learning_rate": 1.8252481758282573e-05, + "loss": 0.1151, + "reward": 1.055729192495346, + "reward_std": 0.2162807509303093, + "rewards/accuracy_reward": 0.09791666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9578125059604645, + "step": 850 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.6458465576172, + "epoch": 0.2723635781725076, + "grad_norm": 1.0094131231307983, + "kl": 2.248434340953827, + "learning_rate": 1.8246164695012817e-05, + "loss": 0.2087, + "reward": 1.0145833611488342, + "reward_std": 0.14186026901006699, + "rewards/accuracy_reward": 0.06458333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9500000178813934, + "step": 851 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.0291778564454, + "epoch": 0.2726836293807009, + "grad_norm": 0.37523195147514343, + "kl": 1.6864983469247818, + "learning_rate": 1.8239837331914098e-05, + "loss": 0.1855, + "reward": 0.9838541805744171, + "reward_std": 0.15197336673736572, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9442708492279053, + "step": 852 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.55418395996094, + "epoch": 0.27300368058889424, + "grad_norm": 0.27413806319236755, + "kl": 1.3582224547863007, + "learning_rate": 1.8233499676889556e-05, + "loss": 0.2058, + "reward": 1.0500000178813935, + "reward_std": 0.15795501098036765, + "rewards/accuracy_reward": 0.10416667088866234, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9458333492279053, + "step": 853 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.4479370117188, + "epoch": 0.2733237317970875, + "grad_norm": 0.6673869490623474, + "kl": 0.7741886451840401, + "learning_rate": 1.822715173785522e-05, + "loss": 0.1256, + "reward": 0.9984375238418579, + "reward_std": 0.09940896760672331, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208492279053, + "step": 854 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.2166778564454, + "epoch": 0.27364378300528086, + "grad_norm": 1.2225868701934814, + "kl": 0.6319494009017944, + "learning_rate": 1.8220793522739947e-05, + "loss": 0.1281, + "reward": 1.0994791865348816, + "reward_std": 0.14179169721901416, + "rewards/accuracy_reward": 0.1250000050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791805744171, + "step": 855 + }, + { + "clip_ratio": 0.0, + "completion_length": 493.2479278564453, + "epoch": 0.27396383421347414, + "grad_norm": 1.0505335330963135, + "kl": 1.155528011918068, + "learning_rate": 1.8214425039485428e-05, + "loss": 0.1561, + "reward": 1.0578125357627868, + "reward_std": 0.14424145892262458, + "rewards/accuracy_reward": 0.10000000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9578125238418579, + "step": 856 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.2020965576172, + "epoch": 0.2742838854216675, + "grad_norm": 0.7932451963424683, + "kl": 2.262352053821087, + "learning_rate": 1.820804629604619e-05, + "loss": 0.3155, + "reward": 1.0395833432674408, + "reward_std": 0.19677990078926086, + "rewards/accuracy_reward": 0.09583333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.943750011920929, + "step": 857 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.45000915527345, + "epoch": 0.27460393662986077, + "grad_norm": 1.4789113998413086, + "kl": 2.0796320915222166, + "learning_rate": 1.8201657300389563e-05, + "loss": 0.2949, + "reward": 1.0286458551883697, + "reward_std": 0.1159073494374752, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791805744171, + "step": 858 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.1958526611328, + "epoch": 0.2749239878380541, + "grad_norm": 0.8765280246734619, + "kl": 1.5847502857446671, + "learning_rate": 1.8195258060495693e-05, + "loss": 0.1356, + "reward": 1.0989583551883697, + "reward_std": 0.13560206349939108, + "rewards/accuracy_reward": 0.13333333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9656250178813934, + "step": 859 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.400015258789, + "epoch": 0.2752440390462474, + "grad_norm": 1.8679522275924683, + "kl": 2.9036698162555696, + "learning_rate": 1.8188848584357516e-05, + "loss": 0.348, + "reward": 0.9890625178813934, + "reward_std": 0.21013089418411254, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9036458432674408, + "step": 860 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.7666809082031, + "epoch": 0.27556409025444073, + "grad_norm": 0.4723247289657593, + "kl": 1.273580791056156, + "learning_rate": 1.8182428879980754e-05, + "loss": 0.1668, + "reward": 0.8343750178813935, + "reward_std": 0.18344238325953482, + "rewards/accuracy_reward": 0.03541666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7989583551883698, + "step": 861 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.6187713623047, + "epoch": 0.275884141462634, + "grad_norm": 0.9013276100158691, + "kl": 1.1724308669567107, + "learning_rate": 1.8175998955383906e-05, + "loss": 0.2216, + "reward": 0.9062500298023224, + "reward_std": 0.19240469932556153, + "rewards/accuracy_reward": 0.10208333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8041666865348815, + "step": 862 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.4750122070312, + "epoch": 0.27620419267082735, + "grad_norm": 1.1328942775726318, + "kl": 0.9504182629287243, + "learning_rate": 1.8169558818598236e-05, + "loss": 0.1826, + "reward": 0.9463541865348816, + "reward_std": 0.16272302493453025, + "rewards/accuracy_reward": 0.05208333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8942708492279052, + "step": 863 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.714599609375, + "epoch": 0.27652424387902064, + "grad_norm": 0.8893626928329468, + "kl": 0.8462207525968551, + "learning_rate": 1.8163108477667762e-05, + "loss": 0.1331, + "reward": 0.9947917044162751, + "reward_std": 0.1171018997207284, + "rewards/accuracy_reward": 0.02708333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083551883697, + "step": 864 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.0895965576172, + "epoch": 0.276844295087214, + "grad_norm": 0.3679625988006592, + "kl": 1.5704083681106566, + "learning_rate": 1.815664794064925e-05, + "loss": 0.1883, + "reward": 0.9822916865348816, + "reward_std": 0.1469844736158848, + "rewards/accuracy_reward": 0.037500002048909666, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9447916746139526, + "step": 865 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.74584045410154, + "epoch": 0.27716434629540726, + "grad_norm": 1.254338264465332, + "kl": 2.278603066504002, + "learning_rate": 1.8150177215612198e-05, + "loss": 0.2866, + "reward": 1.0031250238418579, + "reward_std": 0.2205186128616333, + "rewards/accuracy_reward": 0.06250000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9406250178813934, + "step": 866 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.5208526611328, + "epoch": 0.2774843975036006, + "grad_norm": 0.5605323910713196, + "kl": 2.108324646949768, + "learning_rate": 1.8143696310638836e-05, + "loss": 0.2894, + "reward": 1.0187500298023224, + "reward_std": 0.1810709685087204, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9395833492279053, + "step": 867 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.27500610351564, + "epoch": 0.2778044487117939, + "grad_norm": 0.5356122255325317, + "kl": 1.2343434900045396, + "learning_rate": 1.81372052338241e-05, + "loss": 0.238, + "reward": 1.0869791865348817, + "reward_std": 0.14499858394265175, + "rewards/accuracy_reward": 0.1291666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.957812511920929, + "step": 868 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.0354339599609, + "epoch": 0.2781244999199872, + "grad_norm": 0.437862753868103, + "kl": 1.2617906153202056, + "learning_rate": 1.813070399327564e-05, + "loss": 0.1205, + "reward": 1.0546875178813935, + "reward_std": 0.18123132549226284, + "rewards/accuracy_reward": 0.10000000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.954687523841858, + "step": 869 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.1562652587891, + "epoch": 0.2784445511281805, + "grad_norm": 0.35148391127586365, + "kl": 1.2551678597927094, + "learning_rate": 1.8124192597113786e-05, + "loss": 0.134, + "reward": 1.0380208611488342, + "reward_std": 0.12462070938199758, + "rewards/accuracy_reward": 0.07291666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041865348816, + "step": 870 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.0312652587891, + "epoch": 0.27876460233637385, + "grad_norm": 0.639610230922699, + "kl": 1.409567552804947, + "learning_rate": 1.8117671053471576e-05, + "loss": 0.1501, + "reward": 1.0369791865348816, + "reward_std": 0.1435700273141265, + "rewards/accuracy_reward": 0.07708333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958492279053, + "step": 871 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.8458587646485, + "epoch": 0.27908465354456713, + "grad_norm": 0.6962535977363586, + "kl": 1.1621643796563148, + "learning_rate": 1.8111139370494705e-05, + "loss": 0.1272, + "reward": 1.110416692495346, + "reward_std": 0.12768222466111184, + "rewards/accuracy_reward": 0.14166667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500059604645, + "step": 872 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.8625244140625, + "epoch": 0.27940470475276047, + "grad_norm": 0.45951202511787415, + "kl": 0.5613536521792412, + "learning_rate": 1.8104597556341538e-05, + "loss": 0.0462, + "reward": 1.0333333551883697, + "reward_std": 0.10816633310168981, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833373069763, + "step": 873 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.0979248046875, + "epoch": 0.27972475596095375, + "grad_norm": 0.33993226289749146, + "kl": 0.5836403653025627, + "learning_rate": 1.8098045619183092e-05, + "loss": 0.0209, + "reward": 1.0614583432674407, + "reward_std": 0.08450026344507933, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083432674408, + "step": 874 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.3104370117187, + "epoch": 0.28004480716914704, + "grad_norm": 0.23612137138843536, + "kl": 0.6390758916735649, + "learning_rate": 1.809148356720303e-05, + "loss": 0.0365, + "reward": 1.024479204416275, + "reward_std": 0.10049552712589502, + "rewards/accuracy_reward": 0.047916668094694616, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625178813935, + "step": 875 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.7541931152343, + "epoch": 0.2803648583773404, + "grad_norm": 1.0360605716705322, + "kl": 0.8030475050210952, + "learning_rate": 1.808491140859765e-05, + "loss": 0.0634, + "reward": 0.9723958373069763, + "reward_std": 0.16076738238334656, + "rewards/accuracy_reward": 0.012500000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958432674408, + "step": 876 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.4771118164062, + "epoch": 0.28068490958553366, + "grad_norm": 0.13555213809013367, + "kl": 0.5969193749129772, + "learning_rate": 1.8078329151575874e-05, + "loss": 0.0455, + "reward": 1.0635416805744171, + "reward_std": 0.10078618377447128, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250059604645, + "step": 877 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.8291870117188, + "epoch": 0.281004960793727, + "grad_norm": 0.1967499703168869, + "kl": 0.7825645431876183, + "learning_rate": 1.8071736804359235e-05, + "loss": 0.0243, + "reward": 1.0677083492279054, + "reward_std": 0.0947670703753829, + "rewards/accuracy_reward": 0.08333333339542151, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 878 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.789599609375, + "epoch": 0.2813250120019203, + "grad_norm": 0.24350924789905548, + "kl": 1.0826184466481208, + "learning_rate": 1.806513437518187e-05, + "loss": 0.0796, + "reward": 1.032291704416275, + "reward_std": 0.15164516121149063, + "rewards/accuracy_reward": 0.06250000130385161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 879 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.2604431152344, + "epoch": 0.2816450632101136, + "grad_norm": 0.36908820271492004, + "kl": 1.1280300706624984, + "learning_rate": 1.8058521872290505e-05, + "loss": 0.137, + "reward": 0.973437511920929, + "reward_std": 0.10502424836158752, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708432674408, + "step": 880 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.9083618164062, + "epoch": 0.2819651144183069, + "grad_norm": 0.2288147509098053, + "kl": 0.812470331788063, + "learning_rate": 1.8051899303944454e-05, + "loss": 0.0882, + "reward": 1.0593750059604645, + "reward_std": 0.10690983049571515, + "rewards/accuracy_reward": 0.0812500011175871, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250059604645, + "step": 881 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.6250213623047, + "epoch": 0.28228516562650025, + "grad_norm": 0.5861449837684631, + "kl": 1.3498991549015045, + "learning_rate": 1.8045266678415608e-05, + "loss": 0.1378, + "reward": 0.9854166805744171, + "reward_std": 0.14203399419784546, + "rewards/accuracy_reward": 0.01666666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500119209289, + "step": 882 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.7187683105469, + "epoch": 0.28260521683469353, + "grad_norm": 1.0595533847808838, + "kl": 0.9216204196214676, + "learning_rate": 1.8038624003988406e-05, + "loss": 0.1037, + "reward": 1.014062523841858, + "reward_std": 0.12942186892032623, + "rewards/accuracy_reward": 0.03750000167638064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625178813935, + "step": 883 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.2708526611328, + "epoch": 0.28292526804288687, + "grad_norm": 0.24561259150505066, + "kl": 1.2314361870288848, + "learning_rate": 1.8031971288959845e-05, + "loss": 0.0694, + "reward": 1.0031250238418579, + "reward_std": 0.12703317496925592, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583492279053, + "step": 884 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.0854370117188, + "epoch": 0.28324531925108015, + "grad_norm": 0.39103934168815613, + "kl": 0.7185132935643196, + "learning_rate": 1.8025308541639467e-05, + "loss": 0.1049, + "reward": 1.0197916865348815, + "reward_std": 0.1216716593131423, + "rewards/accuracy_reward": 0.050000001303851606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 885 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.1625183105468, + "epoch": 0.2835653704592735, + "grad_norm": 0.246460422873497, + "kl": 0.48717030733823774, + "learning_rate": 1.8018635770349343e-05, + "loss": 0.0698, + "reward": 0.9666666805744171, + "reward_std": 0.11010051686316728, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833432674408, + "step": 886 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.2646057128907, + "epoch": 0.2838854216674668, + "grad_norm": 0.28537264466285706, + "kl": 0.6939228355884552, + "learning_rate": 1.8011952983424058e-05, + "loss": 0.0918, + "reward": 1.0520833671092986, + "reward_std": 0.17129152230918407, + "rewards/accuracy_reward": 0.10208333674818278, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9500000178813934, + "step": 887 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.7354278564453, + "epoch": 0.2842054728756601, + "grad_norm": 0.37171459197998047, + "kl": 0.8684057459235192, + "learning_rate": 1.800526018921072e-05, + "loss": 0.0965, + "reward": 0.9979166924953461, + "reward_std": 0.1424863189458847, + "rewards/accuracy_reward": 0.05000000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 888 + }, + { + "clip_ratio": 0.0, + "completion_length": 485.01251220703125, + "epoch": 0.2845255240838534, + "grad_norm": 1.3459445238113403, + "kl": 0.7982180349528789, + "learning_rate": 1.7998557396068923e-05, + "loss": 0.0867, + "reward": 1.024479192495346, + "reward_std": 0.17068119421601297, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.9411458492279052, + "step": 889 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.414599609375, + "epoch": 0.28484557529204674, + "grad_norm": 0.24494709074497223, + "kl": 0.787135424464941, + "learning_rate": 1.7991844612370756e-05, + "loss": 0.081, + "reward": 1.0135416984558105, + "reward_std": 0.21748648285865785, + "rewards/accuracy_reward": 0.05416666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9593750059604644, + "step": 890 + }, + { + "clip_ratio": 0.0, + "completion_length": 660.3875244140625, + "epoch": 0.28516562650024, + "grad_norm": 0.18688689172267914, + "kl": 0.7876987963914871, + "learning_rate": 1.798512184650079e-05, + "loss": 0.042, + "reward": 1.0692708492279053, + "reward_std": 0.1874027382582426, + "rewards/accuracy_reward": 0.10000000279396773, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708373069763, + "step": 891 + }, + { + "clip_ratio": 0.0, + "completion_length": 724.6104370117188, + "epoch": 0.28548567770843336, + "grad_norm": 0.09229505062103271, + "kl": 0.4387684382498264, + "learning_rate": 1.7978389106856056e-05, + "loss": 0.0362, + "reward": 1.0125000119209289, + "reward_std": 0.1014697566628456, + "rewards/accuracy_reward": 0.02708333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166686534882, + "step": 892 + }, + { + "clip_ratio": 0.0, + "completion_length": 739.0000183105469, + "epoch": 0.28580572891662664, + "grad_norm": 0.6484495401382446, + "kl": 0.7545491896569729, + "learning_rate": 1.797164640184605e-05, + "loss": 0.0398, + "reward": 1.051041692495346, + "reward_std": 0.16586514431983232, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.971875011920929, + "step": 893 + }, + { + "clip_ratio": 0.0, + "completion_length": 768.3666870117188, + "epoch": 0.28612578012482, + "grad_norm": 0.2332949936389923, + "kl": 1.24727663397789, + "learning_rate": 1.796489373989271e-05, + "loss": 0.0895, + "reward": 1.034375011920929, + "reward_std": 0.12449453994631768, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9635416746139527, + "step": 894 + }, + { + "clip_ratio": 0.0, + "completion_length": 763.9021057128906, + "epoch": 0.28644583133301327, + "grad_norm": 0.18527598679065704, + "kl": 1.00824686139822, + "learning_rate": 1.7958131129430417e-05, + "loss": 0.0326, + "reward": 1.0395833551883698, + "reward_std": 0.10068847816437483, + "rewards/accuracy_reward": 0.06875000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 895 + }, + { + "clip_ratio": 0.0, + "completion_length": 734.9166809082031, + "epoch": 0.2867658825412066, + "grad_norm": 0.5873371362686157, + "kl": 1.7709833174943923, + "learning_rate": 1.7951358578905976e-05, + "loss": 0.0726, + "reward": 0.9812500178813934, + "reward_std": 0.14730002786964178, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9458333551883698, + "step": 896 + }, + { + "clip_ratio": 0.0, + "completion_length": 734.1875183105469, + "epoch": 0.2870859337493999, + "grad_norm": 0.6751114130020142, + "kl": 1.6086274296045304, + "learning_rate": 1.7944576096778595e-05, + "loss": 0.1066, + "reward": 0.9916666924953461, + "reward_std": 0.14545521959662439, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9458333551883698, + "step": 897 + }, + { + "clip_ratio": 0.0, + "completion_length": 679.9854339599609, + "epoch": 0.28740598495759323, + "grad_norm": 0.25408291816711426, + "kl": 1.0246099442243577, + "learning_rate": 1.793778369151991e-05, + "loss": 0.092, + "reward": 1.019791692495346, + "reward_std": 0.15563630759716035, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9489583492279052, + "step": 898 + }, + { + "clip_ratio": 0.0, + "completion_length": 719.8354309082031, + "epoch": 0.2877260361657865, + "grad_norm": 0.5261191129684448, + "kl": 0.9038506269454956, + "learning_rate": 1.7930981371613936e-05, + "loss": 0.0331, + "reward": 0.9869791865348816, + "reward_std": 0.13006459400057793, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9536458492279053, + "step": 899 + }, + { + "clip_ratio": 0.0, + "completion_length": 722.0062622070312, + "epoch": 0.28804608737397985, + "grad_norm": 0.4196346700191498, + "kl": 0.968600545823574, + "learning_rate": 1.792416914555707e-05, + "loss": 0.0621, + "reward": 0.9843750119209289, + "reward_std": 0.15532765444368124, + "rewards/accuracy_reward": 0.025000000558793544, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.959375011920929, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.9187774658203, + "epoch": 0.28836613858217314, + "grad_norm": 0.24890783429145813, + "kl": 0.9109811738133431, + "learning_rate": 1.7917347021858092e-05, + "loss": 0.0637, + "reward": 1.0385416984558105, + "reward_std": 0.1349452082067728, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583611488342, + "step": 901 + }, + { + "clip_ratio": 0.0, + "completion_length": 647.0625122070312, + "epoch": 0.2886861897903665, + "grad_norm": 0.22287528216838837, + "kl": 0.8536801934242249, + "learning_rate": 1.791051500903814e-05, + "loss": 0.0469, + "reward": 0.9989583611488342, + "reward_std": 0.16375069059431552, + "rewards/accuracy_reward": 0.03333333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9656250059604645, + "step": 902 + }, + { + "clip_ratio": 0.0, + "completion_length": 688.527099609375, + "epoch": 0.28900624099855976, + "grad_norm": 0.8015788793563843, + "kl": 1.4864769637584687, + "learning_rate": 1.7903673115630703e-05, + "loss": 0.0914, + "reward": 1.0114583551883698, + "reward_std": 0.16271494328975677, + "rewards/accuracy_reward": 0.052083333395421504, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9593750238418579, + "step": 903 + }, + { + "clip_ratio": 0.0, + "completion_length": 681.2875183105468, + "epoch": 0.2893262922067531, + "grad_norm": 0.4300864040851593, + "kl": 1.9600940197706223, + "learning_rate": 1.7896821350181613e-05, + "loss": 0.1166, + "reward": 1.0031250298023224, + "reward_std": 0.20148923099040986, + "rewards/accuracy_reward": 0.06875000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9343750178813934, + "step": 904 + }, + { + "clip_ratio": 0.0, + "completion_length": 687.5271057128906, + "epoch": 0.2896463434149464, + "grad_norm": 0.2800224721431732, + "kl": 1.190105938911438, + "learning_rate": 1.788995972124903e-05, + "loss": 0.0948, + "reward": 0.9963541924953461, + "reward_std": 0.1741415023803711, + "rewards/accuracy_reward": 0.04791666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.948437511920929, + "step": 905 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.8479309082031, + "epoch": 0.2899663946231397, + "grad_norm": 0.6161094903945923, + "kl": 1.6506537348031998, + "learning_rate": 1.788308823740344e-05, + "loss": 0.1233, + "reward": 1.0333333492279053, + "reward_std": 0.2054402783513069, + "rewards/accuracy_reward": 0.09791666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9354166805744171, + "step": 906 + }, + { + "clip_ratio": 0.0, + "completion_length": 651.6416931152344, + "epoch": 0.290286445831333, + "grad_norm": 0.40559831261634827, + "kl": 1.1269910991191865, + "learning_rate": 1.7876206907227628e-05, + "loss": 0.0767, + "reward": 1.0171875059604645, + "reward_std": 0.16564912348985672, + "rewards/accuracy_reward": 0.06250000149011611, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9526041805744171, + "step": 907 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.4104431152343, + "epoch": 0.29060649703952635, + "grad_norm": 0.8728421926498413, + "kl": 1.824609386920929, + "learning_rate": 1.7869315739316685e-05, + "loss": 0.1396, + "reward": 0.9671875178813935, + "reward_std": 0.22036788761615753, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.917187511920929, + "step": 908 + }, + { + "clip_ratio": 0.0, + "completion_length": 660.864599609375, + "epoch": 0.29092654824771963, + "grad_norm": 0.3799927234649658, + "kl": 1.3191021710634232, + "learning_rate": 1.7862414742277993e-05, + "loss": 0.0871, + "reward": 1.0270833432674409, + "reward_std": 0.1408295204862952, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333492279053, + "step": 909 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.456265258789, + "epoch": 0.29124659945591297, + "grad_norm": 0.8161053657531738, + "kl": 2.05251030921936, + "learning_rate": 1.7855503924731205e-05, + "loss": 0.2041, + "reward": 1.0473958611488343, + "reward_std": 0.18615373224020004, + "rewards/accuracy_reward": 0.10208333861082793, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9453125178813935, + "step": 910 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.612515258789, + "epoch": 0.29156665066410625, + "grad_norm": 0.3766956627368927, + "kl": 1.0139563411474228, + "learning_rate": 1.7848583295308236e-05, + "loss": 0.1265, + "reward": 0.9994791865348815, + "reward_std": 0.11650126576423644, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791746139527, + "step": 911 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.5229309082031, + "epoch": 0.2918867018722996, + "grad_norm": 0.4018208682537079, + "kl": 0.8724268615245819, + "learning_rate": 1.784165286265327e-05, + "loss": 0.132, + "reward": 1.0041666984558106, + "reward_std": 0.14942692667245866, + "rewards/accuracy_reward": 0.045833334885537626, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333611488343, + "step": 912 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.341683959961, + "epoch": 0.2922067530804929, + "grad_norm": 0.30784985423088074, + "kl": 0.8771272003650665, + "learning_rate": 1.7834712635422718e-05, + "loss": 0.1001, + "reward": 1.0505208432674409, + "reward_std": 0.18126559555530547, + "rewards/accuracy_reward": 0.10000000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9505208492279053, + "step": 913 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.1208557128906, + "epoch": 0.2925268042886862, + "grad_norm": 0.278812974691391, + "kl": 0.8446752950549126, + "learning_rate": 1.7827762622285245e-05, + "loss": 0.1318, + "reward": 1.0598958611488343, + "reward_std": 0.18923650197684766, + "rewards/accuracy_reward": 0.10208333786576987, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9578125178813934, + "step": 914 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.808349609375, + "epoch": 0.2928468554968795, + "grad_norm": 0.15253132581710815, + "kl": 0.4771121509373188, + "learning_rate": 1.7820802831921723e-05, + "loss": 0.06, + "reward": 1.066666704416275, + "reward_std": 0.12053482681512832, + "rewards/accuracy_reward": 0.09166667088866234, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000238418579, + "step": 915 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.3854370117188, + "epoch": 0.29316690670507284, + "grad_norm": 0.21070396900177002, + "kl": 0.3982532635331154, + "learning_rate": 1.7813833273025237e-05, + "loss": 0.1003, + "reward": 1.0447916865348816, + "reward_std": 0.15599412955343722, + "rewards/accuracy_reward": 0.07291666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750059604645, + "step": 916 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.0041870117187, + "epoch": 0.2934869579132661, + "grad_norm": 0.3192347586154938, + "kl": 0.7548017039895057, + "learning_rate": 1.780685395430109e-05, + "loss": 0.0836, + "reward": 1.0635416805744171, + "reward_std": 0.13123438209295274, + "rewards/accuracy_reward": 0.0895833358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583373069763, + "step": 917 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.8812744140625, + "epoch": 0.2938070091214594, + "grad_norm": 0.18562725186347961, + "kl": 0.5546324595808982, + "learning_rate": 1.779986488446676e-05, + "loss": 0.0847, + "reward": 1.0500000357627868, + "reward_std": 0.1341039039194584, + "rewards/accuracy_reward": 0.08750000204890966, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9604166805744171, + "step": 918 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.4958557128906, + "epoch": 0.29412706032965275, + "grad_norm": 0.18860366940498352, + "kl": 0.60446348041296, + "learning_rate": 1.77928660722519e-05, + "loss": 0.0673, + "reward": 1.0619791805744172, + "reward_std": 0.15306191500276328, + "rewards/accuracy_reward": 0.0979166692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9640625178813934, + "step": 919 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.8270965576172, + "epoch": 0.29444711153784603, + "grad_norm": 0.4077078700065613, + "kl": 0.9243380039930343, + "learning_rate": 1.7785857526398347e-05, + "loss": 0.1521, + "reward": 1.0421875298023224, + "reward_std": 0.18533986136317254, + "rewards/accuracy_reward": 0.0958333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9463541865348816, + "step": 920 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.7916931152344, + "epoch": 0.29476716274603937, + "grad_norm": 0.17928466200828552, + "kl": 0.7125317409634591, + "learning_rate": 1.7778839255660087e-05, + "loss": 0.1819, + "reward": 1.0593750298023223, + "reward_std": 0.16393165495246648, + "rewards/accuracy_reward": 0.11458333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9447916865348815, + "step": 921 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.7062622070313, + "epoch": 0.29508721395423265, + "grad_norm": 0.1269647628068924, + "kl": 0.6188096687197685, + "learning_rate": 1.7771811268803258e-05, + "loss": 0.0549, + "reward": 1.0052083551883697, + "reward_std": 0.11031838692724705, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9656250238418579, + "step": 922 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.4354400634766, + "epoch": 0.295407265162426, + "grad_norm": 0.4039466083049774, + "kl": 1.4429447636008264, + "learning_rate": 1.7764773574606124e-05, + "loss": 0.1123, + "reward": 1.0437500298023223, + "reward_std": 0.1867046182975173, + "rewards/accuracy_reward": 0.09166666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9520833551883697, + "step": 923 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.133349609375, + "epoch": 0.2957273163706193, + "grad_norm": 0.20248201489448547, + "kl": 0.6664691850543022, + "learning_rate": 1.7757726181859084e-05, + "loss": 0.1074, + "reward": 1.0020833551883697, + "reward_std": 0.10909523330628872, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833492279052, + "step": 924 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.9187622070312, + "epoch": 0.2960473675788126, + "grad_norm": 0.3584718704223633, + "kl": 0.9459757208824158, + "learning_rate": 1.7750669099364643e-05, + "loss": 0.116, + "reward": 1.0421875298023224, + "reward_std": 0.13381226696074008, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208492279053, + "step": 925 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.2375122070313, + "epoch": 0.2963674187870059, + "grad_norm": 0.27927231788635254, + "kl": 0.7706046402454376, + "learning_rate": 1.774360233593742e-05, + "loss": 0.1015, + "reward": 1.0286458551883697, + "reward_std": 0.13063797876238822, + "rewards/accuracy_reward": 0.06250000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458492279053, + "step": 926 + }, + { + "clip_ratio": 0.0, + "completion_length": 629.7625183105469, + "epoch": 0.29668746999519924, + "grad_norm": 0.26649898290634155, + "kl": 1.7680400401353835, + "learning_rate": 1.7736525900404114e-05, + "loss": 0.1569, + "reward": 1.0302083671092988, + "reward_std": 0.18417428024113178, + "rewards/accuracy_reward": 0.08958333637565374, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9385416865348816, + "step": 927 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.7000274658203, + "epoch": 0.2970075212033925, + "grad_norm": 0.4364243149757385, + "kl": 1.2817148357629775, + "learning_rate": 1.772943980160351e-05, + "loss": 0.1445, + "reward": 1.0682291984558105, + "reward_std": 0.25006254613399503, + "rewards/accuracy_reward": 0.12708333618938922, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9411458432674408, + "step": 928 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.2708557128906, + "epoch": 0.29732757241158586, + "grad_norm": 0.43147382140159607, + "kl": 0.8256754875183105, + "learning_rate": 1.7722344048386468e-05, + "loss": 0.1191, + "reward": 1.0005208551883698, + "reward_std": 0.15139769725501537, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9588541805744171, + "step": 929 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.2187713623047, + "epoch": 0.29764762361977914, + "grad_norm": 1.0563299655914307, + "kl": 1.2677388548851014, + "learning_rate": 1.7715238649615893e-05, + "loss": 0.1303, + "reward": 1.0494791865348816, + "reward_std": 0.1273749502375722, + "rewards/accuracy_reward": 0.09166666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9578125238418579, + "step": 930 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.6125122070313, + "epoch": 0.2979676748279725, + "grad_norm": 0.1512734740972519, + "kl": 0.6284838706254959, + "learning_rate": 1.770812361416675e-05, + "loss": 0.1211, + "reward": 1.060937523841858, + "reward_std": 0.1170524686574936, + "rewards/accuracy_reward": 0.08750000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375178813934, + "step": 931 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.3916839599609, + "epoch": 0.29828772603616577, + "grad_norm": 0.1825270652770996, + "kl": 0.3309263564646244, + "learning_rate": 1.770099895092604e-05, + "loss": 0.0311, + "reward": 1.1010416924953461, + "reward_std": 0.07066966965794563, + "rewards/accuracy_reward": 0.11250000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9885416805744172, + "step": 932 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.8979370117188, + "epoch": 0.2986077772443591, + "grad_norm": 0.3317464590072632, + "kl": 0.2580555848777294, + "learning_rate": 1.7693864668792785e-05, + "loss": 0.0402, + "reward": 1.1088541984558105, + "reward_std": 0.09786607697606087, + "rewards/accuracy_reward": 0.12291667088866234, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 933 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.3729309082031, + "epoch": 0.2989278284525524, + "grad_norm": 0.3599172830581665, + "kl": 0.26390968188643454, + "learning_rate": 1.768672077667802e-05, + "loss": 0.0126, + "reward": 1.0473958611488343, + "reward_std": 0.1011963851749897, + "rewards/accuracy_reward": 0.058333336003124715, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.989062511920929, + "step": 934 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.2916870117188, + "epoch": 0.29924787966074573, + "grad_norm": 0.20416362583637238, + "kl": 0.44174774885177615, + "learning_rate": 1.767956728350479e-05, + "loss": 0.0472, + "reward": 1.0395833551883698, + "reward_std": 0.07305270098149777, + "rewards/accuracy_reward": 0.05625000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333432674408, + "step": 935 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.5771026611328, + "epoch": 0.299567930868939, + "grad_norm": 0.30982765555381775, + "kl": 0.47428609281778333, + "learning_rate": 1.7672404198208123e-05, + "loss": 0.0426, + "reward": 1.1197916984558105, + "reward_std": 0.1203194510191679, + "rewards/accuracy_reward": 0.1437500050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416865348815, + "step": 936 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.5479339599609, + "epoch": 0.29988798207713235, + "grad_norm": 0.11249273270368576, + "kl": 0.6478874146938324, + "learning_rate": 1.7665231529735042e-05, + "loss": 0.0666, + "reward": 1.0703125238418578, + "reward_std": 0.10500245019793511, + "rewards/accuracy_reward": 0.09166666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 937 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.2646026611328, + "epoch": 0.30020803328532564, + "grad_norm": 1.0702687501907349, + "kl": 0.7110832586884499, + "learning_rate": 1.765804928704452e-05, + "loss": 0.1257, + "reward": 1.0927083611488342, + "reward_std": 0.12243700325489044, + "rewards/accuracy_reward": 0.11875000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 938 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.7104431152344, + "epoch": 0.300528084493519, + "grad_norm": 0.1390598863363266, + "kl": 0.5671024739742279, + "learning_rate": 1.7650857479107507e-05, + "loss": 0.1166, + "reward": 0.9692708551883698, + "reward_std": 0.10575719326734542, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708551883698, + "step": 939 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.1354309082031, + "epoch": 0.30084813570171226, + "grad_norm": 0.3185669779777527, + "kl": 0.6791951522231102, + "learning_rate": 1.7643656114906895e-05, + "loss": 0.0587, + "reward": 1.0505208492279052, + "reward_std": 0.16533472537994384, + "rewards/accuracy_reward": 0.08541666846722365, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041805744172, + "step": 940 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.7916870117188, + "epoch": 0.3011681869099056, + "grad_norm": 0.40729889273643494, + "kl": 1.093244832754135, + "learning_rate": 1.7636445203437503e-05, + "loss": 0.1655, + "reward": 1.0260416865348816, + "reward_std": 0.1972845211625099, + "rewards/accuracy_reward": 0.08333333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9427083492279053, + "step": 941 + }, + { + "clip_ratio": 0.0, + "completion_length": 676.4041870117187, + "epoch": 0.3014882381180989, + "grad_norm": 0.0986812561750412, + "kl": 0.3631768196821213, + "learning_rate": 1.7629224753706088e-05, + "loss": 0.0565, + "reward": 1.027604192495346, + "reward_std": 0.09776556696742773, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041865348816, + "step": 942 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.5583618164062, + "epoch": 0.3018082893262922, + "grad_norm": 0.2628479599952698, + "kl": 0.47837393432855607, + "learning_rate": 1.762199477473131e-05, + "loss": 0.087, + "reward": 1.067187523841858, + "reward_std": 0.11832643263041973, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041805744172, + "step": 943 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.8875183105469, + "epoch": 0.3021283405344855, + "grad_norm": 0.175709068775177, + "kl": 0.9550945192575455, + "learning_rate": 1.7614755275543748e-05, + "loss": 0.1663, + "reward": 1.0333333671092988, + "reward_std": 0.17842126339673997, + "rewards/accuracy_reward": 0.08750000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9458333611488342, + "step": 944 + }, + { + "clip_ratio": 0.0, + "completion_length": 654.9562683105469, + "epoch": 0.30244839174267885, + "grad_norm": 0.34556370973587036, + "kl": 0.4930610120296478, + "learning_rate": 1.7607506265185846e-05, + "loss": 0.0986, + "reward": 0.9947916805744171, + "reward_std": 0.1405083030462265, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9552083432674408, + "step": 945 + }, + { + "clip_ratio": 0.0, + "completion_length": 672.8104431152344, + "epoch": 0.30276844295087213, + "grad_norm": 0.26512089371681213, + "kl": 0.7804930925369262, + "learning_rate": 1.7600247752711952e-05, + "loss": 0.0917, + "reward": 1.0041666865348815, + "reward_std": 0.17392733693122864, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9333333551883698, + "step": 946 + }, + { + "clip_ratio": 0.0, + "completion_length": 725.4125183105468, + "epoch": 0.30308849415906547, + "grad_norm": 0.1472983956336975, + "kl": 0.7122527778148651, + "learning_rate": 1.759297974718827e-05, + "loss": 0.1153, + "reward": 1.071875023841858, + "reward_std": 0.19012432545423508, + "rewards/accuracy_reward": 0.12500000204890965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9468750119209289, + "step": 947 + }, + { + "clip_ratio": 0.0, + "completion_length": 667.7604370117188, + "epoch": 0.30340854536725875, + "grad_norm": 0.2633792757987976, + "kl": 0.8807353228330612, + "learning_rate": 1.7585702257692863e-05, + "loss": 0.1206, + "reward": 1.0348958611488341, + "reward_std": 0.20192356854677201, + "rewards/accuracy_reward": 0.09583333656191825, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9390625119209289, + "step": 948 + }, + { + "clip_ratio": 0.0, + "completion_length": 692.2646057128907, + "epoch": 0.3037285965754521, + "grad_norm": 0.5958443880081177, + "kl": 1.3553169280290605, + "learning_rate": 1.7578415293315646e-05, + "loss": 0.1455, + "reward": 0.977604192495346, + "reward_std": 0.2028628334403038, + "rewards/accuracy_reward": 0.06041667014360428, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9171875238418579, + "step": 949 + }, + { + "clip_ratio": 0.0, + "completion_length": 694.8229309082031, + "epoch": 0.3040486477836454, + "grad_norm": 0.32592839002609253, + "kl": 1.444900530576706, + "learning_rate": 1.7571118863158355e-05, + "loss": 0.1701, + "reward": 1.0734375357627868, + "reward_std": 0.26215763986110685, + "rewards/accuracy_reward": 0.1791666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8942708492279052, + "step": 950 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.8479431152343, + "epoch": 0.3043686989918387, + "grad_norm": 0.2379060685634613, + "kl": 1.5481299102306365, + "learning_rate": 1.756381297633457e-05, + "loss": 0.1559, + "reward": 1.008854192495346, + "reward_std": 0.24572829753160477, + "rewards/accuracy_reward": 0.1354166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8734375178813935, + "step": 951 + }, + { + "clip_ratio": 0.0, + "completion_length": 731.483349609375, + "epoch": 0.304688750200032, + "grad_norm": 0.17579278349876404, + "kl": 1.133284804224968, + "learning_rate": 1.7556497641969658e-05, + "loss": 0.0919, + "reward": 1.068229192495346, + "reward_std": 0.21461983472108842, + "rewards/accuracy_reward": 0.15000000298023225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9182291865348816, + "step": 952 + }, + { + "clip_ratio": 0.0, + "completion_length": 723.1666870117188, + "epoch": 0.30500880140822534, + "grad_norm": 0.3515166938304901, + "kl": 1.3535830855369568, + "learning_rate": 1.754917286920081e-05, + "loss": 0.112, + "reward": 0.9218750119209289, + "reward_std": 0.24458130151033403, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.8822916805744171, + "step": 953 + }, + { + "clip_ratio": 0.0, + "completion_length": 713.7125183105469, + "epoch": 0.3053288526164186, + "grad_norm": 0.20482727885246277, + "kl": 1.2124733626842499, + "learning_rate": 1.7541838667176993e-05, + "loss": 0.0829, + "reward": 0.9005208492279053, + "reward_std": 0.23665229380130767, + "rewards/accuracy_reward": 0.010416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8901041924953461, + "step": 954 + }, + { + "clip_ratio": 0.0, + "completion_length": 705.5625183105469, + "epoch": 0.30564890382461196, + "grad_norm": 0.29011431336402893, + "kl": 1.3561316847801208, + "learning_rate": 1.7534495045058947e-05, + "loss": 0.106, + "reward": 0.9645833551883698, + "reward_std": 0.21052157506346703, + "rewards/accuracy_reward": 0.0770833358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8875000238418579, + "step": 955 + }, + { + "clip_ratio": 0.0, + "completion_length": 702.7250244140625, + "epoch": 0.30596895503280525, + "grad_norm": 0.3863130807876587, + "kl": 1.455906194448471, + "learning_rate": 1.7527142012019193e-05, + "loss": 0.1285, + "reward": 0.8713541865348816, + "reward_std": 0.2652384236454964, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8692708551883698, + "step": 956 + }, + { + "clip_ratio": 0.0, + "completion_length": 808.6521118164062, + "epoch": 0.3062890062409986, + "grad_norm": 0.18241232633590698, + "kl": 1.7860671520233153, + "learning_rate": 1.7519779577241993e-05, + "loss": 0.1083, + "reward": 0.9317708492279053, + "reward_std": 0.27449193596839905, + "rewards/accuracy_reward": 0.06875000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8630208432674408, + "step": 957 + }, + { + "clip_ratio": 0.0, + "completion_length": 751.5437744140625, + "epoch": 0.30660905744919187, + "grad_norm": 0.32234227657318115, + "kl": 1.7606966257095338, + "learning_rate": 1.751240774992336e-05, + "loss": 0.134, + "reward": 0.9302083492279053, + "reward_std": 0.27238035053014753, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8614583432674408, + "step": 958 + }, + { + "clip_ratio": 0.0, + "completion_length": 787.4166870117188, + "epoch": 0.3069291086573852, + "grad_norm": 0.6801765561103821, + "kl": 2.5121779322624205, + "learning_rate": 1.7505026539271038e-05, + "loss": 0.1615, + "reward": 0.9062500298023224, + "reward_std": 0.3029158145189285, + "rewards/accuracy_reward": 0.050000001303851606, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8562500178813934, + "step": 959 + }, + { + "clip_ratio": 0.0, + "completion_length": 770.1125244140625, + "epoch": 0.3072491598655785, + "grad_norm": 0.19171766936779022, + "kl": 1.4077009975910186, + "learning_rate": 1.7497635954504487e-05, + "loss": 0.0902, + "reward": 0.9135416805744171, + "reward_std": 0.2074896477162838, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9093750178813934, + "step": 960 + }, + { + "clip_ratio": 0.0, + "completion_length": 772.3312744140625, + "epoch": 0.30756921107377183, + "grad_norm": 0.27294042706489563, + "kl": 0.896188372373581, + "learning_rate": 1.749023600485488e-05, + "loss": 0.0352, + "reward": 1.0109375238418579, + "reward_std": 0.19567719101905823, + "rewards/accuracy_reward": 0.07500000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.935937511920929, + "step": 961 + }, + { + "clip_ratio": 0.0, + "completion_length": 711.4562683105469, + "epoch": 0.3078892622819651, + "grad_norm": 0.16912463307380676, + "kl": 0.9790063366293907, + "learning_rate": 1.7482826699565083e-05, + "loss": 0.0763, + "reward": 0.9838541805744171, + "reward_std": 0.19945336878299713, + "rewards/accuracy_reward": 0.05416666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9296875119209289, + "step": 962 + }, + { + "clip_ratio": 0.0, + "completion_length": 726.5479431152344, + "epoch": 0.3082093134901584, + "grad_norm": 0.1460140198469162, + "kl": 0.7135851427912712, + "learning_rate": 1.747540804788965e-05, + "loss": 0.0191, + "reward": 0.962500023841858, + "reward_std": 0.13189447149634362, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333492279053, + "step": 963 + }, + { + "clip_ratio": 0.0, + "completion_length": 743.6562683105469, + "epoch": 0.30852936469835174, + "grad_norm": 0.13990092277526855, + "kl": 0.574808469414711, + "learning_rate": 1.7467980059094817e-05, + "loss": 0.0155, + "reward": 0.9984375178813935, + "reward_std": 0.11035698503255845, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9630208432674408, + "step": 964 + }, + { + "clip_ratio": 0.0, + "completion_length": 712.4729370117187, + "epoch": 0.308849415906545, + "grad_norm": 0.2480141520500183, + "kl": 0.6258080065250397, + "learning_rate": 1.7460542742458464e-05, + "loss": 0.056, + "reward": 1.0510416984558106, + "reward_std": 0.1838358849287033, + "rewards/accuracy_reward": 0.08750000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9635416746139527, + "step": 965 + }, + { + "clip_ratio": 0.0, + "completion_length": 716.4937744140625, + "epoch": 0.30916946711473836, + "grad_norm": 0.2927420139312744, + "kl": 0.6674822881817818, + "learning_rate": 1.745309610727014e-05, + "loss": 0.0581, + "reward": 1.0708333492279052, + "reward_std": 0.09178536143153906, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 966 + }, + { + "clip_ratio": 0.0, + "completion_length": 728.0312744140625, + "epoch": 0.30948951832293164, + "grad_norm": 0.0917883962392807, + "kl": 0.24520479291677474, + "learning_rate": 1.744564016283102e-05, + "loss": 0.0244, + "reward": 1.0692708551883698, + "reward_std": 0.08112927377223969, + "rewards/accuracy_reward": 0.08333333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375059604645, + "step": 967 + }, + { + "clip_ratio": 0.0, + "completion_length": 743.9187622070312, + "epoch": 0.309809569531125, + "grad_norm": 0.0622573047876358, + "kl": 0.3754092678427696, + "learning_rate": 1.7438174918453916e-05, + "loss": 0.0386, + "reward": 1.0364583611488343, + "reward_std": 0.13707383908331394, + "rewards/accuracy_reward": 0.06250000130385161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 968 + }, + { + "clip_ratio": 0.0, + "completion_length": 680.3625244140625, + "epoch": 0.31012962073931827, + "grad_norm": 0.1604725569486618, + "kl": 0.5867097809910774, + "learning_rate": 1.7430700383463253e-05, + "loss": 0.0852, + "reward": 1.058854192495346, + "reward_std": 0.20737518668174743, + "rewards/accuracy_reward": 0.10833333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9505208492279053, + "step": 969 + }, + { + "clip_ratio": 0.0, + "completion_length": 727.0729370117188, + "epoch": 0.3104496719475116, + "grad_norm": 0.10366953909397125, + "kl": 0.4695418193936348, + "learning_rate": 1.742321656719506e-05, + "loss": 0.0847, + "reward": 1.009375023841858, + "reward_std": 0.12343942523002624, + "rewards/accuracy_reward": 0.04375000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.965625011920929, + "step": 970 + }, + { + "clip_ratio": 0.0, + "completion_length": 663.1896118164062, + "epoch": 0.3107697231557049, + "grad_norm": 0.37568023800849915, + "kl": 0.9744888663291931, + "learning_rate": 1.7415723478996955e-05, + "loss": 0.1292, + "reward": 1.0635416865348817, + "reward_std": 0.1882859192788601, + "rewards/accuracy_reward": 0.1145833358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9489583432674408, + "step": 971 + }, + { + "clip_ratio": 0.0, + "completion_length": 687.7958435058594, + "epoch": 0.31108977436389823, + "grad_norm": 0.05901394784450531, + "kl": 0.26913181617856025, + "learning_rate": 1.7408221128228145e-05, + "loss": 0.0557, + "reward": 1.0322916865348817, + "reward_std": 0.10280282869935035, + "rewards/accuracy_reward": 0.052083333395421504, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083432674408, + "step": 972 + }, + { + "clip_ratio": 0.0, + "completion_length": 661.0646057128906, + "epoch": 0.3114098255720915, + "grad_norm": 0.12370312213897705, + "kl": 0.5913057863712311, + "learning_rate": 1.74007095242594e-05, + "loss": 0.1314, + "reward": 1.0161458492279052, + "reward_std": 0.1552325375378132, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9494791805744172, + "step": 973 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.9771057128906, + "epoch": 0.31172987678028485, + "grad_norm": 0.313650906085968, + "kl": 0.38522453233599663, + "learning_rate": 1.7393188676473053e-05, + "loss": 0.0671, + "reward": 1.0052083551883697, + "reward_std": 0.0870123527944088, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750178813934, + "step": 974 + }, + { + "clip_ratio": 0.0, + "completion_length": 698.7041748046875, + "epoch": 0.31204992798847814, + "grad_norm": 0.10242673009634018, + "kl": 0.7089316248893738, + "learning_rate": 1.738565859426297e-05, + "loss": 0.0851, + "reward": 0.9661458551883697, + "reward_std": 0.1504125714302063, + "rewards/accuracy_reward": 0.010416667163372039, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291805744171, + "step": 975 + }, + { + "clip_ratio": 0.0, + "completion_length": 662.2646118164063, + "epoch": 0.3123699791966715, + "grad_norm": 0.1471061110496521, + "kl": 0.5538154274225235, + "learning_rate": 1.737811928703457e-05, + "loss": 0.0833, + "reward": 1.0546875178813935, + "reward_std": 0.1336808368563652, + "rewards/accuracy_reward": 0.09166666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208492279053, + "step": 976 + }, + { + "clip_ratio": 0.0, + "completion_length": 687.2437683105469, + "epoch": 0.31269003040486476, + "grad_norm": 0.10087238997220993, + "kl": 0.48820848688483237, + "learning_rate": 1.7370570764204788e-05, + "loss": 0.0999, + "reward": 1.0343750298023224, + "reward_std": 0.15432624202221631, + "rewards/accuracy_reward": 0.0666666692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083492279053, + "step": 977 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.7041870117188, + "epoch": 0.3130100816130581, + "grad_norm": 0.22208651900291443, + "kl": 0.6460809573531151, + "learning_rate": 1.7363013035202058e-05, + "loss": 0.0418, + "reward": 1.0875000178813934, + "reward_std": 0.1667162150144577, + "rewards/accuracy_reward": 0.12083333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666805744171, + "step": 978 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.1083557128907, + "epoch": 0.3133301328212514, + "grad_norm": 0.139918714761734, + "kl": 0.7306598663330078, + "learning_rate": 1.7355446109466326e-05, + "loss": 0.1177, + "reward": 1.0744791984558106, + "reward_std": 0.2074648855254054, + "rewards/accuracy_reward": 0.12916667219251393, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9453125119209289, + "step": 979 + }, + { + "clip_ratio": 0.0, + "completion_length": 622.9604309082031, + "epoch": 0.3136501840294447, + "grad_norm": 0.18447650969028473, + "kl": 0.7331676751375198, + "learning_rate": 1.734786999644902e-05, + "loss": 0.1319, + "reward": 1.017187523841858, + "reward_std": 0.15683282688260078, + "rewards/accuracy_reward": 0.05833333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9588541805744171, + "step": 980 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.8250122070312, + "epoch": 0.313970235237638, + "grad_norm": 0.1246795505285263, + "kl": 0.5896144509315491, + "learning_rate": 1.7340284705613045e-05, + "loss": 0.0758, + "reward": 1.0437500178813934, + "reward_std": 0.10564655810594559, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166805744172, + "step": 981 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.4041809082031, + "epoch": 0.31429028644583135, + "grad_norm": 0.27654311060905457, + "kl": 0.4418774448335171, + "learning_rate": 1.7332690246432774e-05, + "loss": 0.0852, + "reward": 1.084895873069763, + "reward_std": 0.14493267983198166, + "rewards/accuracy_reward": 0.11458333805203438, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.970312523841858, + "step": 982 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.7979370117188, + "epoch": 0.31461033765402463, + "grad_norm": 0.18854264914989471, + "kl": 1.0708917260169983, + "learning_rate": 1.7325086628394017e-05, + "loss": 0.1695, + "reward": 1.1531250417232513, + "reward_std": 0.19459521472454072, + "rewards/accuracy_reward": 0.20833334028720857, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9447916746139526, + "step": 983 + }, + { + "clip_ratio": 0.0, + "completion_length": 643.2104370117188, + "epoch": 0.31493038886221797, + "grad_norm": 0.4541122615337372, + "kl": 1.2559853374958039, + "learning_rate": 1.731747386099404e-05, + "loss": 0.1223, + "reward": 0.947916692495346, + "reward_std": 0.21363041847944259, + "rewards/accuracy_reward": 0.025000001303851604, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9229166984558106, + "step": 984 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.2625152587891, + "epoch": 0.31525044007041125, + "grad_norm": 0.5043960809707642, + "kl": 1.4750530004501343, + "learning_rate": 1.7309851953741532e-05, + "loss": 0.1493, + "reward": 0.9682291924953461, + "reward_std": 0.20985282510519027, + "rewards/accuracy_reward": 0.03750000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9307291865348816, + "step": 985 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.8229370117188, + "epoch": 0.3155704912786046, + "grad_norm": 0.5190443992614746, + "kl": 1.9234457969665528, + "learning_rate": 1.7302220916156592e-05, + "loss": 0.2225, + "reward": 0.9427083551883697, + "reward_std": 0.2576408013701439, + "rewards/accuracy_reward": 0.041666666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 986 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.17294921875, + "epoch": 0.3158905424867979, + "grad_norm": 0.2320450246334076, + "kl": 1.2377650499343873, + "learning_rate": 1.7294580757770725e-05, + "loss": 0.141, + "reward": 0.9635416865348816, + "reward_std": 0.21727037131786348, + "rewards/accuracy_reward": 0.0479166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9156250119209289, + "step": 987 + }, + { + "clip_ratio": 0.0, + "completion_length": 629.5395935058593, + "epoch": 0.3162105936949912, + "grad_norm": 0.1871139407157898, + "kl": 1.1846880629658698, + "learning_rate": 1.728693148812684e-05, + "loss": 0.1154, + "reward": 0.9864583730697631, + "reward_std": 0.1802637368440628, + "rewards/accuracy_reward": 0.052083336375653745, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9343750238418579, + "step": 988 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.0541809082031, + "epoch": 0.3165306449031845, + "grad_norm": 0.4449709951877594, + "kl": 0.8319399744272232, + "learning_rate": 1.727927311677921e-05, + "loss": 0.1479, + "reward": 1.0192708551883698, + "reward_std": 0.1877690449357033, + "rewards/accuracy_reward": 0.07083333432674407, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9484375178813934, + "step": 989 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.6937683105468, + "epoch": 0.31685069611137784, + "grad_norm": 0.3338225483894348, + "kl": 1.130355241894722, + "learning_rate": 1.7271605653293486e-05, + "loss": 0.122, + "reward": 0.9359375178813935, + "reward_std": 0.2035887584090233, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9317708551883698, + "step": 990 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.0375244140625, + "epoch": 0.3171707473195711, + "grad_norm": 0.15540385246276855, + "kl": 0.9231228500604629, + "learning_rate": 1.7263929107246672e-05, + "loss": 0.1501, + "reward": 1.0187500238418579, + "reward_std": 0.15587160028517247, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 991 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.5458557128907, + "epoch": 0.31749079852776446, + "grad_norm": 0.11945409327745438, + "kl": 0.8397936165332794, + "learning_rate": 1.725624348822712e-05, + "loss": 0.1723, + "reward": 1.0463541924953461, + "reward_std": 0.15689616054296493, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9463541865348816, + "step": 992 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.2479370117187, + "epoch": 0.31781084973595775, + "grad_norm": 0.474109411239624, + "kl": 1.2607935786247253, + "learning_rate": 1.7248548805834512e-05, + "loss": 0.2222, + "reward": 0.9911458611488342, + "reward_std": 0.2234620615839958, + "rewards/accuracy_reward": 0.06666666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9244791865348816, + "step": 993 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.1166900634765, + "epoch": 0.3181309009441511, + "grad_norm": 0.33378320932388306, + "kl": 1.0174303948879242, + "learning_rate": 1.724084506967985e-05, + "loss": 0.1641, + "reward": 1.0338541984558105, + "reward_std": 0.2083968624472618, + "rewards/accuracy_reward": 0.0958333358168602, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.935937511920929, + "step": 994 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.108349609375, + "epoch": 0.31845095215234437, + "grad_norm": 0.20785702764987946, + "kl": 0.7034070655703545, + "learning_rate": 1.723313228938545e-05, + "loss": 0.1507, + "reward": 0.9697916805744171, + "reward_std": 0.19483174681663512, + "rewards/accuracy_reward": 0.0375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9322916746139527, + "step": 995 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.8812622070312, + "epoch": 0.3187710033605377, + "grad_norm": 0.26794859766960144, + "kl": 0.9634685277938843, + "learning_rate": 1.7225410474584907e-05, + "loss": 0.1563, + "reward": 1.052604192495346, + "reward_std": 0.19138510003685952, + "rewards/accuracy_reward": 0.11875000409781933, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9338541805744172, + "step": 996 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.5541839599609, + "epoch": 0.319091054568731, + "grad_norm": 0.25083127617836, + "kl": 1.1079894408583641, + "learning_rate": 1.721767963492313e-05, + "loss": 0.2077, + "reward": 0.9994791984558106, + "reward_std": 0.22150392681360245, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9161458551883698, + "step": 997 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.695849609375, + "epoch": 0.31941110577692433, + "grad_norm": 0.21013391017913818, + "kl": 0.9574685275554657, + "learning_rate": 1.7209939780056273e-05, + "loss": 0.1939, + "reward": 1.0083333551883698, + "reward_std": 0.16795330494642258, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9375000178813935, + "step": 998 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.3500213623047, + "epoch": 0.3197311569851176, + "grad_norm": 0.11133457720279694, + "kl": 0.6998517155647278, + "learning_rate": 1.7202190919651764e-05, + "loss": 0.1522, + "reward": 1.0833333611488343, + "reward_std": 0.14614312946796418, + "rewards/accuracy_reward": 0.13541667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 999 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.8375183105469, + "epoch": 0.32005120819331095, + "grad_norm": 0.16730454564094543, + "kl": 1.1215898275375367, + "learning_rate": 1.7194433063388273e-05, + "loss": 0.2736, + "reward": 1.0567708611488342, + "reward_std": 0.23038013577461242, + "rewards/accuracy_reward": 0.13125000447034835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9255208492279052, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.7354278564453, + "epoch": 0.32037125940150424, + "grad_norm": 0.2715721130371094, + "kl": 1.28203387260437, + "learning_rate": 1.718666622095572e-05, + "loss": 0.1875, + "reward": 1.0302083551883698, + "reward_std": 0.17592437490820884, + "rewards/accuracy_reward": 0.09166666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9385416865348816, + "step": 1001 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.2479370117187, + "epoch": 0.3206913106096976, + "grad_norm": 0.28725120425224304, + "kl": 0.7281457930803299, + "learning_rate": 1.7178890402055232e-05, + "loss": 0.1394, + "reward": 0.9776041865348816, + "reward_std": 0.1258978858590126, + "rewards/accuracy_reward": 0.01875, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9588541805744171, + "step": 1002 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.7625122070312, + "epoch": 0.32101136181789086, + "grad_norm": 0.12669086456298828, + "kl": 0.750990717113018, + "learning_rate": 1.7171105616399153e-05, + "loss": 0.1796, + "reward": 1.0588541865348815, + "reward_std": 0.1406536651775241, + "rewards/accuracy_reward": 0.10625000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9526041805744171, + "step": 1003 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.8916839599609, + "epoch": 0.3213314130260842, + "grad_norm": 0.20147007703781128, + "kl": 0.889993640780449, + "learning_rate": 1.7163311873711035e-05, + "loss": 0.1543, + "reward": 1.0171875178813934, + "reward_std": 0.15972171053290368, + "rewards/accuracy_reward": 0.060416667722165585, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9567708492279052, + "step": 1004 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.09376220703126, + "epoch": 0.3216514642342775, + "grad_norm": 0.25828316807746887, + "kl": 0.7611904472112656, + "learning_rate": 1.7155509183725607e-05, + "loss": 0.177, + "reward": 1.0395833611488343, + "reward_std": 0.18584888130426408, + "rewards/accuracy_reward": 0.08333333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.956250011920929, + "step": 1005 + }, + { + "clip_ratio": 0.0, + "completion_length": 492.1937622070312, + "epoch": 0.32197151544247077, + "grad_norm": 0.11577334254980087, + "kl": 0.6240782648324966, + "learning_rate": 1.714769755618878e-05, + "loss": 0.1137, + "reward": 1.0635416805744171, + "reward_std": 0.1440664477646351, + "rewards/accuracy_reward": 0.09375000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916746139527, + "step": 1006 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.85625915527345, + "epoch": 0.3222915666506641, + "grad_norm": 0.2497272938489914, + "kl": 0.3379806771874428, + "learning_rate": 1.7139877000857623e-05, + "loss": 0.0777, + "reward": 1.0541666865348815, + "reward_std": 0.0930209718644619, + "rewards/accuracy_reward": 0.07500000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1007 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.9854248046875, + "epoch": 0.3226116178588574, + "grad_norm": 0.35958606004714966, + "kl": 0.5691968247294426, + "learning_rate": 1.7132047527500366e-05, + "loss": 0.1087, + "reward": 1.0572916984558105, + "reward_std": 0.1227844811975956, + "rewards/accuracy_reward": 0.08333333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1008 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.6020935058594, + "epoch": 0.32293166906705073, + "grad_norm": 0.1607331484556198, + "kl": 0.38854978755116465, + "learning_rate": 1.712420914589637e-05, + "loss": 0.1146, + "reward": 1.0718750178813934, + "reward_std": 0.10704346112906933, + "rewards/accuracy_reward": 0.09791666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1009 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.42918395996094, + "epoch": 0.323251720275244, + "grad_norm": 0.21614070236682892, + "kl": 0.5235861442983151, + "learning_rate": 1.711636186583612e-05, + "loss": 0.1206, + "reward": 1.1130208551883698, + "reward_std": 0.09185979887843132, + "rewards/accuracy_reward": 0.1312500059604645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708432674408, + "step": 1010 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.1458465576172, + "epoch": 0.32357177148343735, + "grad_norm": 0.21211141347885132, + "kl": 0.5399421505630017, + "learning_rate": 1.710850569712123e-05, + "loss": 0.0784, + "reward": 1.1104166984558106, + "reward_std": 0.11394599229097366, + "rewards/accuracy_reward": 0.1270833384245634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333432674408, + "step": 1011 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.9541809082031, + "epoch": 0.32389182269163064, + "grad_norm": 0.15681226551532745, + "kl": 0.43956211805343626, + "learning_rate": 1.7100640649564396e-05, + "loss": 0.0954, + "reward": 1.042187511920929, + "reward_std": 0.07975867688655854, + "rewards/accuracy_reward": 0.05416666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9880208432674408, + "step": 1012 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.1875061035156, + "epoch": 0.324211873899824, + "grad_norm": 0.468483567237854, + "kl": 0.8010672204196453, + "learning_rate": 1.7092766732989418e-05, + "loss": 0.1121, + "reward": 1.0697916865348815, + "reward_std": 0.12852012366056442, + "rewards/accuracy_reward": 0.08958333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083373069763, + "step": 1013 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.1958465576172, + "epoch": 0.32453192510801726, + "grad_norm": 0.21684125065803528, + "kl": 0.39564828500151633, + "learning_rate": 1.708488395723117e-05, + "loss": 0.0683, + "reward": 1.0682291865348816, + "reward_std": 0.09386988766491414, + "rewards/accuracy_reward": 0.08125000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9869791746139527, + "step": 1014 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.4958435058594, + "epoch": 0.3248519763162106, + "grad_norm": 0.23216085135936737, + "kl": 0.6974504925310612, + "learning_rate": 1.7076992332135595e-05, + "loss": 0.1328, + "reward": 1.0578125178813935, + "reward_std": 0.11282338351011276, + "rewards/accuracy_reward": 0.07708333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291805744172, + "step": 1015 + }, + { + "clip_ratio": 0.0, + "completion_length": 478.4666748046875, + "epoch": 0.3251720275244039, + "grad_norm": 0.16336947679519653, + "kl": 0.4402541309595108, + "learning_rate": 1.7069091867559687e-05, + "loss": 0.0909, + "reward": 1.0713541865348817, + "reward_std": 0.09093062989413739, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 1016 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.71250915527344, + "epoch": 0.3254920787325972, + "grad_norm": 0.24683783948421478, + "kl": 0.5640803650021553, + "learning_rate": 1.706118257337148e-05, + "loss": 0.0919, + "reward": 1.0421875238418579, + "reward_std": 0.1091598778963089, + "rewards/accuracy_reward": 0.06458333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 1017 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.1416839599609, + "epoch": 0.3258121299407905, + "grad_norm": 0.18254989385604858, + "kl": 0.44138511940836905, + "learning_rate": 1.7053264459450023e-05, + "loss": 0.1237, + "reward": 1.018229180574417, + "reward_std": 0.08106882330030203, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291805744172, + "step": 1018 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.80626220703124, + "epoch": 0.32613218114898385, + "grad_norm": 0.1619545817375183, + "kl": 0.5294121131300926, + "learning_rate": 1.7045337535685414e-05, + "loss": 0.0465, + "reward": 1.0416666984558105, + "reward_std": 0.144361755810678, + "rewards/accuracy_reward": 0.08333333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1019 + }, + { + "clip_ratio": 0.0, + "completion_length": 478.7354309082031, + "epoch": 0.32645223235717713, + "grad_norm": 0.18814796209335327, + "kl": 0.449739009141922, + "learning_rate": 1.7037401811978726e-05, + "loss": 0.0714, + "reward": 1.0338541984558105, + "reward_std": 0.1286115448921919, + "rewards/accuracy_reward": 0.06250000353902578, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 1020 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.3479278564453, + "epoch": 0.32677228356537047, + "grad_norm": 0.45165371894836426, + "kl": 0.5484678715467453, + "learning_rate": 1.7029457298242035e-05, + "loss": 0.1563, + "reward": 1.1328125298023224, + "reward_std": 0.14226205535233022, + "rewards/accuracy_reward": 0.17291667312383652, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958551883697, + "step": 1021 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.78126220703126, + "epoch": 0.32709233477356375, + "grad_norm": 0.34666770696640015, + "kl": 0.6640940323472023, + "learning_rate": 1.7021504004398392e-05, + "loss": 0.1551, + "reward": 1.0348958611488341, + "reward_std": 0.11599632911384106, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9640625238418579, + "step": 1022 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.120849609375, + "epoch": 0.3274123859817571, + "grad_norm": 0.261369913816452, + "kl": 1.0041472047567368, + "learning_rate": 1.7013541940381824e-05, + "loss": 0.2455, + "reward": 1.0723958730697631, + "reward_std": 0.22422086521983148, + "rewards/accuracy_reward": 0.12916666977107524, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9432291805744171, + "step": 1023 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.2083435058594, + "epoch": 0.3277324371899504, + "grad_norm": 0.15666531026363373, + "kl": 0.6061700366437435, + "learning_rate": 1.70055711161373e-05, + "loss": 0.1381, + "reward": 1.0192708671092987, + "reward_std": 0.12545478213578462, + "rewards/accuracy_reward": 0.05208333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875178813935, + "step": 1024 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.61667785644534, + "epoch": 0.3280524883981437, + "grad_norm": 0.3670523464679718, + "kl": 0.8996363550424575, + "learning_rate": 1.6997591541620734e-05, + "loss": 0.1796, + "reward": 1.0375000298023225, + "reward_std": 0.17077935561537744, + "rewards/accuracy_reward": 0.07708333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9604166746139526, + "step": 1025 + }, + { + "clip_ratio": 0.0, + "completion_length": 495.0062622070312, + "epoch": 0.328372539606337, + "grad_norm": 0.16689921915531158, + "kl": 0.8198397219181061, + "learning_rate": 1.6989603226798976e-05, + "loss": 0.2232, + "reward": 1.0385416865348815, + "reward_std": 0.17929813712835313, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916746139526, + "step": 1026 + }, + { + "clip_ratio": 0.0, + "completion_length": 492.21250915527344, + "epoch": 0.32869259081453034, + "grad_norm": 0.38878318667411804, + "kl": 1.0202806413173675, + "learning_rate": 1.698160618164979e-05, + "loss": 0.2086, + "reward": 1.0447916984558105, + "reward_std": 0.16940562725067138, + "rewards/accuracy_reward": 0.09791667014360428, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9468750119209289, + "step": 1027 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.2270965576172, + "epoch": 0.3290126420227236, + "grad_norm": 0.21051208674907684, + "kl": 0.7796215415000916, + "learning_rate": 1.6973600416161842e-05, + "loss": 0.1823, + "reward": 1.0338541865348816, + "reward_std": 0.15895916149020195, + "rewards/accuracy_reward": 0.07291666828095913, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9588541805744171, + "step": 1028 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.7583465576172, + "epoch": 0.32933269323091696, + "grad_norm": 0.3499182462692261, + "kl": 0.8251389652490616, + "learning_rate": 1.6965585940334688e-05, + "loss": 0.2098, + "reward": 1.0192708611488341, + "reward_std": 0.14868669509887694, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9526041865348815, + "step": 1029 + }, + { + "clip_ratio": 0.0, + "completion_length": 489.2979309082031, + "epoch": 0.32965274443911025, + "grad_norm": 0.32130110263824463, + "kl": 0.570948114991188, + "learning_rate": 1.6957562764178774e-05, + "loss": 0.1494, + "reward": 1.009375011920929, + "reward_std": 0.10973553471267224, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 1030 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.38543701171875, + "epoch": 0.3299727956473036, + "grad_norm": 0.43677008152008057, + "kl": 0.7646385207772255, + "learning_rate": 1.69495308977154e-05, + "loss": 0.2192, + "reward": 1.1062500298023223, + "reward_std": 0.15374659057706594, + "rewards/accuracy_reward": 0.1479166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333492279053, + "step": 1031 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.4104339599609, + "epoch": 0.33029284685549687, + "grad_norm": 0.2866804003715515, + "kl": 0.9388459503650666, + "learning_rate": 1.694149035097673e-05, + "loss": 0.2006, + "reward": 1.002604180574417, + "reward_std": 0.16000167801976203, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.9526041805744171, + "step": 1032 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.7291778564453, + "epoch": 0.3306128980636902, + "grad_norm": 0.2829350531101227, + "kl": 1.3405283033847808, + "learning_rate": 1.6933441134005774e-05, + "loss": 0.3405, + "reward": 1.133854216337204, + "reward_std": 0.1917470723390579, + "rewards/accuracy_reward": 0.19583334028720856, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9380208551883698, + "step": 1033 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.714599609375, + "epoch": 0.3309329492718835, + "grad_norm": 0.4547743499279022, + "kl": 1.0454600259661675, + "learning_rate": 1.692538325685635e-05, + "loss": 0.274, + "reward": 1.113541692495346, + "reward_std": 0.18550372272729873, + "rewards/accuracy_reward": 0.177083339355886, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9343750238418579, + "step": 1034 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.5916778564453, + "epoch": 0.33125300048007683, + "grad_norm": 0.17376388609409332, + "kl": 0.7498746126890182, + "learning_rate": 1.6917316729593115e-05, + "loss": 0.1823, + "reward": 1.0651041924953462, + "reward_std": 0.1255657471716404, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375059604645, + "step": 1035 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.60418395996095, + "epoch": 0.3315730516882701, + "grad_norm": 0.2241523712873459, + "kl": 0.6255024075508118, + "learning_rate": 1.6909241562291522e-05, + "loss": 0.2153, + "reward": 1.0432291865348815, + "reward_std": 0.1870565339922905, + "rewards/accuracy_reward": 0.08750000279396772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291805744171, + "step": 1036 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.62500915527346, + "epoch": 0.33189310289646345, + "grad_norm": 0.3327322006225586, + "kl": 0.9747333094477654, + "learning_rate": 1.690115776503782e-05, + "loss": 0.1304, + "reward": 1.1036458492279053, + "reward_std": 0.15871551111340523, + "rewards/accuracy_reward": 0.14791667107492684, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291805744171, + "step": 1037 + }, + { + "clip_ratio": 0.0, + "completion_length": 485.9416839599609, + "epoch": 0.33221315410465674, + "grad_norm": 0.29658105969429016, + "kl": 1.1347735792398452, + "learning_rate": 1.689306534792903e-05, + "loss": 0.1945, + "reward": 1.101562535762787, + "reward_std": 0.19802000969648362, + "rewards/accuracy_reward": 0.14583333842456342, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 1038 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.0104248046875, + "epoch": 0.3325332053128501, + "grad_norm": 0.30956873297691345, + "kl": 0.9602329656481743, + "learning_rate": 1.6884964321072938e-05, + "loss": 0.1944, + "reward": 1.004687523841858, + "reward_std": 0.17770743370056152, + "rewards/accuracy_reward": 0.05208333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9526041805744171, + "step": 1039 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.8354248046875, + "epoch": 0.33285325652104336, + "grad_norm": 0.2959546446800232, + "kl": 0.6376917466521264, + "learning_rate": 1.68768546945881e-05, + "loss": 0.1583, + "reward": 1.0161458551883698, + "reward_std": 0.12123525217175483, + "rewards/accuracy_reward": 0.045833334885537626, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125178813934, + "step": 1040 + }, + { + "clip_ratio": 0.0, + "completion_length": 472.9666809082031, + "epoch": 0.3331733077292367, + "grad_norm": 0.13327783346176147, + "kl": 0.6734405755996704, + "learning_rate": 1.68687364786038e-05, + "loss": 0.1282, + "reward": 1.1036458551883697, + "reward_std": 0.168779456615448, + "rewards/accuracy_reward": 0.14166667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791746139527, + "step": 1041 + }, + { + "clip_ratio": 0.0, + "completion_length": 492.8271026611328, + "epoch": 0.33349335893743, + "grad_norm": 0.1860327571630478, + "kl": 0.4504072442650795, + "learning_rate": 1.686060968326005e-05, + "loss": 0.1078, + "reward": 1.066666692495346, + "reward_std": 0.10958906393498183, + "rewards/accuracy_reward": 0.08958333693444728, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833492279053, + "step": 1042 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.4041778564453, + "epoch": 0.3338134101456233, + "grad_norm": 0.24637283384799957, + "kl": 1.1397089630365371, + "learning_rate": 1.685247431870758e-05, + "loss": 0.1869, + "reward": 0.9864583492279053, + "reward_std": 0.13785818926990032, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9531250119209289, + "step": 1043 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.10834350585935, + "epoch": 0.3341334613538166, + "grad_norm": 0.14904294908046722, + "kl": 0.3731867730617523, + "learning_rate": 1.6844330395107825e-05, + "loss": 0.112, + "reward": 1.0729166984558105, + "reward_std": 0.13176908865571021, + "rewards/accuracy_reward": 0.09375000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1044 + }, + { + "clip_ratio": 0.0, + "completion_length": 472.1208465576172, + "epoch": 0.33445351256200995, + "grad_norm": 0.06686241179704666, + "kl": 0.30279314517974854, + "learning_rate": 1.6836177922632918e-05, + "loss": 0.0505, + "reward": 1.0250000178813934, + "reward_std": 0.054784043319523336, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9875000059604645, + "step": 1045 + }, + { + "clip_ratio": 0.0, + "completion_length": 493.8479278564453, + "epoch": 0.33477356377020323, + "grad_norm": 0.09057088941335678, + "kl": 0.549952282756567, + "learning_rate": 1.6828016911465655e-05, + "loss": 0.0907, + "reward": 1.0505208492279052, + "reward_std": 0.08606277983635664, + "rewards/accuracy_reward": 0.07083333432674407, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.979687511920929, + "step": 1046 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.7375091552734, + "epoch": 0.33509361497839657, + "grad_norm": 0.34637758135795593, + "kl": 0.7677448585629463, + "learning_rate": 1.6819847371799505e-05, + "loss": 0.1089, + "reward": 0.9848958671092987, + "reward_std": 0.11421102657914162, + "rewards/accuracy_reward": 0.01250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958551883698, + "step": 1047 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.1625091552734, + "epoch": 0.33541366618658985, + "grad_norm": 0.15661399066448212, + "kl": 0.47885870188474655, + "learning_rate": 1.681166931383859e-05, + "loss": 0.0583, + "reward": 1.056250023841858, + "reward_std": 0.0759154099971056, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166746139527, + "step": 1048 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.1291778564453, + "epoch": 0.33573371739478314, + "grad_norm": 0.16242820024490356, + "kl": 0.251812618970871, + "learning_rate": 1.6803482747797674e-05, + "loss": 0.0133, + "reward": 1.0750000119209289, + "reward_std": 0.08946933038532734, + "rewards/accuracy_reward": 0.08750000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9875000178813934, + "step": 1049 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.3854248046875, + "epoch": 0.3360537686029765, + "grad_norm": 0.23674461245536804, + "kl": 0.4152077123522758, + "learning_rate": 1.6795287683902136e-05, + "loss": 0.0852, + "reward": 1.0583333492279052, + "reward_std": 0.11216433495283126, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333432674408, + "step": 1050 + }, + { + "clip_ratio": 0.0, + "completion_length": 486.94793395996095, + "epoch": 0.33637381981116976, + "grad_norm": 0.15628622472286224, + "kl": 0.33315576761960985, + "learning_rate": 1.6787084132387987e-05, + "loss": 0.0482, + "reward": 1.0989583432674408, + "reward_std": 0.12317422851920128, + "rewards/accuracy_reward": 0.11458333786576987, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1051 + }, + { + "clip_ratio": 0.0, + "completion_length": 485.56251525878906, + "epoch": 0.3366938710193631, + "grad_norm": 0.21389330923557281, + "kl": 0.5460788942873478, + "learning_rate": 1.6778872103501825e-05, + "loss": 0.051, + "reward": 1.021875011920929, + "reward_std": 0.1267334796488285, + "rewards/accuracy_reward": 0.04583333376795053, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1052 + }, + { + "clip_ratio": 0.0, + "completion_length": 486.83335266113284, + "epoch": 0.3370139222275564, + "grad_norm": 0.1881573349237442, + "kl": 0.36430116593837736, + "learning_rate": 1.677065160750084e-05, + "loss": 0.0722, + "reward": 1.062500011920929, + "reward_std": 0.10569052752107382, + "rewards/accuracy_reward": 0.08333333432674409, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1053 + }, + { + "clip_ratio": 0.0, + "completion_length": 504.9583465576172, + "epoch": 0.3373339734357497, + "grad_norm": 0.1714252233505249, + "kl": 0.5596703916788102, + "learning_rate": 1.6762422654652806e-05, + "loss": 0.1102, + "reward": 1.0255208611488342, + "reward_std": 0.11673090867698192, + "rewards/accuracy_reward": 0.050000001676380634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1054 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.2104339599609, + "epoch": 0.337654024643943, + "grad_norm": 0.13085860013961792, + "kl": 0.6136700950562954, + "learning_rate": 1.6754185255236047e-05, + "loss": 0.0916, + "reward": 1.0729166984558105, + "reward_std": 0.1413856975734234, + "rewards/accuracy_reward": 0.09791667107492685, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1055 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.6895965576172, + "epoch": 0.33797407585213635, + "grad_norm": 0.16582679748535156, + "kl": 0.6665895022451878, + "learning_rate": 1.674593941953945e-05, + "loss": 0.0631, + "reward": 1.053645873069763, + "reward_std": 0.1587853878736496, + "rewards/accuracy_reward": 0.07916666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791805744171, + "step": 1056 + }, + { + "clip_ratio": 0.0, + "completion_length": 507.177099609375, + "epoch": 0.33829412706032963, + "grad_norm": 0.09304836392402649, + "kl": 0.4092469088733196, + "learning_rate": 1.6737685157862428e-05, + "loss": 0.072, + "reward": 1.1234375417232514, + "reward_std": 0.1315523639321327, + "rewards/accuracy_reward": 0.13958333767950534, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541746139526, + "step": 1057 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.7229370117187, + "epoch": 0.33861417826852297, + "grad_norm": 0.35923483967781067, + "kl": 0.9676612123847008, + "learning_rate": 1.6729422480514926e-05, + "loss": 0.074, + "reward": 1.0307291865348815, + "reward_std": 0.12929603606462478, + "rewards/accuracy_reward": 0.05625000018626451, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9723958492279052, + "step": 1058 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.5208587646484, + "epoch": 0.33893422947671625, + "grad_norm": 0.20335394144058228, + "kl": 0.747900664061308, + "learning_rate": 1.67211513978174e-05, + "loss": 0.1232, + "reward": 1.0276041984558106, + "reward_std": 0.15231779962778091, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 1059 + }, + { + "clip_ratio": 0.0, + "completion_length": 491.5520965576172, + "epoch": 0.3392542806849096, + "grad_norm": 0.23044894635677338, + "kl": 0.5531878419220447, + "learning_rate": 1.6712871920100796e-05, + "loss": 0.1287, + "reward": 1.0656250238418579, + "reward_std": 0.11024533435702324, + "rewards/accuracy_reward": 0.09166667014360427, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583373069763, + "step": 1060 + }, + { + "clip_ratio": 0.0, + "completion_length": 497.895849609375, + "epoch": 0.3395743318931029, + "grad_norm": 0.13916930556297302, + "kl": 0.6994629740715027, + "learning_rate": 1.6704584057706558e-05, + "loss": 0.1484, + "reward": 1.0302083551883698, + "reward_std": 0.14814634323120118, + "rewards/accuracy_reward": 0.0666666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9635416805744171, + "step": 1061 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.333349609375, + "epoch": 0.3398943831012962, + "grad_norm": 0.06657827645540237, + "kl": 0.26350629031658174, + "learning_rate": 1.6696287820986595e-05, + "loss": 0.0732, + "reward": 0.9911458373069764, + "reward_std": 0.06581217646598816, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9869791686534881, + "step": 1062 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.0541748046875, + "epoch": 0.3402144343094895, + "grad_norm": 0.14629337191581726, + "kl": 0.2807926818728447, + "learning_rate": 1.668798322030328e-05, + "loss": 0.0584, + "reward": 1.0437500119209289, + "reward_std": 0.06695888042449952, + "rewards/accuracy_reward": 0.06041666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333432674408, + "step": 1063 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.6021057128906, + "epoch": 0.34053448551768284, + "grad_norm": 0.22216708958148956, + "kl": 0.6122836649417878, + "learning_rate": 1.667967026602943e-05, + "loss": 0.0986, + "reward": 1.0703125238418578, + "reward_std": 0.1326361045241356, + "rewards/accuracy_reward": 0.10000000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125178813934, + "step": 1064 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.1354248046875, + "epoch": 0.3408545367258761, + "grad_norm": 0.13464389741420746, + "kl": 0.6803478240966797, + "learning_rate": 1.66713489685483e-05, + "loss": 0.1292, + "reward": 1.0578125357627868, + "reward_std": 0.1410712368786335, + "rewards/accuracy_reward": 0.0916666692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458492279053, + "step": 1065 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.6291870117187, + "epoch": 0.34117458793406946, + "grad_norm": 0.12916676700115204, + "kl": 0.5221568688750267, + "learning_rate": 1.6663019338253556e-05, + "loss": 0.104, + "reward": 1.0270833432674409, + "reward_std": 0.12490638475865126, + "rewards/accuracy_reward": 0.05625, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 1066 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.689599609375, + "epoch": 0.34149463914226275, + "grad_norm": 0.21543385088443756, + "kl": 0.8520060390233993, + "learning_rate": 1.665468138554929e-05, + "loss": 0.2002, + "reward": 0.979166692495346, + "reward_std": 0.16948885917663575, + "rewards/accuracy_reward": 0.02291666679084301, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9541666805744171, + "step": 1067 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.2770935058594, + "epoch": 0.3418146903504561, + "grad_norm": 0.26105642318725586, + "kl": 0.8584218144416809, + "learning_rate": 1.6646335120849964e-05, + "loss": 0.2162, + "reward": 1.0072916924953461, + "reward_std": 0.20712767243385316, + "rewards/accuracy_reward": 0.05625000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9510416924953461, + "step": 1068 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.4416809082031, + "epoch": 0.34213474155864937, + "grad_norm": 0.24653339385986328, + "kl": 1.068970836699009, + "learning_rate": 1.6637980554580447e-05, + "loss": 0.1043, + "reward": 1.1468750298023225, + "reward_std": 0.1717896606773138, + "rewards/accuracy_reward": 0.18958334028720855, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916865348816, + "step": 1069 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.23126220703125, + "epoch": 0.3424547927668427, + "grad_norm": 0.1497165560722351, + "kl": 1.0819413036108017, + "learning_rate": 1.6629617697175967e-05, + "loss": 0.1963, + "reward": 0.993750023841858, + "reward_std": 0.18702587112784386, + "rewards/accuracy_reward": 0.03750000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9562500298023224, + "step": 1070 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.4021026611329, + "epoch": 0.342774843975036, + "grad_norm": 0.6320114731788635, + "kl": 1.6799171954393386, + "learning_rate": 1.66212465590821e-05, + "loss": 0.2058, + "reward": 1.105729204416275, + "reward_std": 0.1997086688876152, + "rewards/accuracy_reward": 0.16458333767950534, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9411458492279052, + "step": 1071 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.039599609375, + "epoch": 0.34309489518322933, + "grad_norm": 0.23983895778656006, + "kl": 0.9433726727962494, + "learning_rate": 1.6612867150754776e-05, + "loss": 0.091, + "reward": 1.0604167044162751, + "reward_std": 0.1479168005287647, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333492279053, + "step": 1072 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.1604370117187, + "epoch": 0.3434149463914226, + "grad_norm": 0.19629456102848053, + "kl": 0.9360431842505932, + "learning_rate": 1.6604479482660257e-05, + "loss": 0.1161, + "reward": 1.051562523841858, + "reward_std": 0.1672593917697668, + "rewards/accuracy_reward": 0.0958333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291805744171, + "step": 1073 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.5458465576172, + "epoch": 0.34373499759961595, + "grad_norm": 0.5441807508468628, + "kl": 0.7919405251741409, + "learning_rate": 1.6596083565275107e-05, + "loss": 0.1435, + "reward": 0.9848958551883698, + "reward_std": 0.14814932085573673, + "rewards/accuracy_reward": 0.029166667722165585, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 1074 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.2146057128906, + "epoch": 0.34405504880780924, + "grad_norm": 0.20584291219711304, + "kl": 0.8357704304158687, + "learning_rate": 1.6587679409086207e-05, + "loss": 0.1099, + "reward": 1.0416666865348816, + "reward_std": 0.11115128733217716, + "rewards/accuracy_reward": 0.07083333432674407, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333492279053, + "step": 1075 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.4625183105469, + "epoch": 0.3443751000160026, + "grad_norm": 0.2858220636844635, + "kl": 0.8899985015392303, + "learning_rate": 1.6579267024590727e-05, + "loss": 0.1511, + "reward": 1.1177083790302276, + "reward_std": 0.15269537195563315, + "rewards/accuracy_reward": 0.15833333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9593750178813935, + "step": 1076 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.2854278564453, + "epoch": 0.34469515122419586, + "grad_norm": 0.20057371258735657, + "kl": 0.8167131602764129, + "learning_rate": 1.6570846422296102e-05, + "loss": 0.1817, + "reward": 1.0286458492279054, + "reward_std": 0.17073360234498977, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9494791805744172, + "step": 1077 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.61876525878904, + "epoch": 0.3450152024323892, + "grad_norm": 0.12494718283414841, + "kl": 0.4915993630886078, + "learning_rate": 1.6562417612720055e-05, + "loss": 0.0835, + "reward": 1.0744792103767395, + "reward_std": 0.14771257862448692, + "rewards/accuracy_reward": 0.09375000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291865348816, + "step": 1078 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.5000152587891, + "epoch": 0.3453352536405825, + "grad_norm": 0.17784957587718964, + "kl": 0.9045666679739952, + "learning_rate": 1.6553980606390538e-05, + "loss": 0.1211, + "reward": 1.0593750119209289, + "reward_std": 0.13307706415653228, + "rewards/accuracy_reward": 0.08958333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 1079 + }, + { + "clip_ratio": 0.0, + "completion_length": 489.95001220703125, + "epoch": 0.3456553048487758, + "grad_norm": 0.09965129941701889, + "kl": 0.5666639655828476, + "learning_rate": 1.654553541384575e-05, + "loss": 0.0682, + "reward": 0.9807291805744172, + "reward_std": 0.09674109499901533, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625119209289, + "step": 1080 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.4583526611328, + "epoch": 0.3459753560569691, + "grad_norm": 0.3702761232852936, + "kl": 0.6852307498455048, + "learning_rate": 1.6537082045634116e-05, + "loss": 0.1417, + "reward": 1.0458333551883698, + "reward_std": 0.09951124414801597, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166746139526, + "step": 1081 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.1000122070312, + "epoch": 0.34629540726516245, + "grad_norm": 0.29468998312950134, + "kl": 0.8192525319755077, + "learning_rate": 1.6528620512314276e-05, + "loss": 0.1118, + "reward": 1.017187523841858, + "reward_std": 0.09673091005533933, + "rewards/accuracy_reward": 0.039583335444331166, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041746139527, + "step": 1082 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.035433959961, + "epoch": 0.34661545847335573, + "grad_norm": 0.11237310618162155, + "kl": 0.3425339564681053, + "learning_rate": 1.652015082445506e-05, + "loss": 0.0822, + "reward": 1.0802083551883697, + "reward_std": 0.0760267723351717, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083492279052, + "step": 1083 + }, + { + "clip_ratio": 0.0, + "completion_length": 493.88958740234375, + "epoch": 0.34693550968154907, + "grad_norm": 0.1413935422897339, + "kl": 0.43408130556344987, + "learning_rate": 1.6511672992635478e-05, + "loss": 0.1053, + "reward": 1.0927083551883698, + "reward_std": 0.13995889909565448, + "rewards/accuracy_reward": 0.11875000353902579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 1084 + }, + { + "clip_ratio": 0.0, + "completion_length": 496.1083526611328, + "epoch": 0.34725556088974235, + "grad_norm": 0.09691617637872696, + "kl": 0.2840468570590019, + "learning_rate": 1.6503187027444737e-05, + "loss": 0.0298, + "reward": 1.1270833611488342, + "reward_std": 0.04991227090358734, + "rewards/accuracy_reward": 0.13750000409781932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9895833373069763, + "step": 1085 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.3021026611328, + "epoch": 0.3475756120979357, + "grad_norm": 0.04689677432179451, + "kl": 0.18280332162976265, + "learning_rate": 1.6494692939482183e-05, + "loss": 0.0029, + "reward": 1.103125023841858, + "reward_std": 0.07317595779895783, + "rewards/accuracy_reward": 0.10833333935588599, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9947916746139527, + "step": 1086 + }, + { + "clip_ratio": 0.0, + "completion_length": 487.4875183105469, + "epoch": 0.347895663306129, + "grad_norm": 0.0464547798037529, + "kl": 0.22884158343076705, + "learning_rate": 1.6486190739357307e-05, + "loss": 0.007, + "reward": 1.074479192495346, + "reward_std": 0.06167098730802536, + "rewards/accuracy_reward": 0.08333333432674409, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9911458551883697, + "step": 1087 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.2875122070312, + "epoch": 0.3482157145143223, + "grad_norm": 0.17549873888492584, + "kl": 0.2067810483276844, + "learning_rate": 1.6477680437689746e-05, + "loss": 0.0603, + "reward": 1.0901041865348815, + "reward_std": 0.11889987215399742, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9901041746139526, + "step": 1088 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.6250183105469, + "epoch": 0.3485357657225156, + "grad_norm": 0.2971534729003906, + "kl": 0.2798635631799698, + "learning_rate": 1.646916204510924e-05, + "loss": 0.0891, + "reward": 1.085416704416275, + "reward_std": 0.10783621110022068, + "rewards/accuracy_reward": 0.10833333842456341, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833432674408, + "step": 1089 + }, + { + "clip_ratio": 0.0, + "completion_length": 501.827099609375, + "epoch": 0.34885581693070894, + "grad_norm": 0.22730644047260284, + "kl": 0.2414296567440033, + "learning_rate": 1.6460635572255644e-05, + "loss": 0.0829, + "reward": 1.0468750298023224, + "reward_std": 0.08340628929436207, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916805744171, + "step": 1090 + }, + { + "clip_ratio": 0.0, + "completion_length": 486.25209350585936, + "epoch": 0.3491758681389022, + "grad_norm": 0.10404152423143387, + "kl": 0.45262009650468826, + "learning_rate": 1.6452101029778908e-05, + "loss": 0.0416, + "reward": 1.005729180574417, + "reward_std": 0.06869241334497929, + "rewards/accuracy_reward": 0.020833333395421504, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958492279053, + "step": 1091 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.92710266113284, + "epoch": 0.34949591934709556, + "grad_norm": 0.09202590584754944, + "kl": 0.250038680434227, + "learning_rate": 1.6443558428339054e-05, + "loss": 0.0159, + "reward": 1.090625035762787, + "reward_std": 0.10430398043245077, + "rewards/accuracy_reward": 0.0979166703298688, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9927083432674408, + "step": 1092 + }, + { + "clip_ratio": 0.0, + "completion_length": 499.0458465576172, + "epoch": 0.34981597055528885, + "grad_norm": 0.10142835229635239, + "kl": 0.2372105412185192, + "learning_rate": 1.6435007778606177e-05, + "loss": 0.0508, + "reward": 1.0635416984558106, + "reward_std": 0.08527100309729577, + "rewards/accuracy_reward": 0.07291666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.990625011920929, + "step": 1093 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.3916839599609, + "epoch": 0.35013602176348213, + "grad_norm": 0.08181018382310867, + "kl": 0.3042118564248085, + "learning_rate": 1.6426449091260424e-05, + "loss": 0.0624, + "reward": 1.0932291924953461, + "reward_std": 0.10010075122117996, + "rewards/accuracy_reward": 0.10833333749324084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958551883698, + "step": 1094 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.5520965576172, + "epoch": 0.35045607297167547, + "grad_norm": 0.1259605586528778, + "kl": 0.19862473011016846, + "learning_rate": 1.641788237699197e-05, + "loss": 0.025, + "reward": 1.0520833611488343, + "reward_std": 0.08347481749951839, + "rewards/accuracy_reward": 0.06041666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9916666746139526, + "step": 1095 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.3416809082031, + "epoch": 0.35077612417986875, + "grad_norm": 0.6125533580780029, + "kl": 0.23866599127650262, + "learning_rate": 1.6409307646501032e-05, + "loss": 0.0605, + "reward": 1.0468750178813935, + "reward_std": 0.08658724837005138, + "rewards/accuracy_reward": 0.06250000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1096 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.6604309082031, + "epoch": 0.3510961753880621, + "grad_norm": 0.13647539913654327, + "kl": 0.5289441749453545, + "learning_rate": 1.6400724910497832e-05, + "loss": 0.0664, + "reward": 1.0973958551883698, + "reward_std": 0.15426294282078742, + "rewards/accuracy_reward": 0.11666667126119137, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291746139526, + "step": 1097 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.6437683105469, + "epoch": 0.3514162265962554, + "grad_norm": 0.14252297580242157, + "kl": 0.3495174624025822, + "learning_rate": 1.6392134179702585e-05, + "loss": 0.0596, + "reward": 1.0697916865348815, + "reward_std": 0.07795717976987362, + "rewards/accuracy_reward": 0.08541666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1098 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.4708465576172, + "epoch": 0.3517362778044487, + "grad_norm": 0.12695029377937317, + "kl": 0.3373839229345322, + "learning_rate": 1.6383535464845507e-05, + "loss": 0.0763, + "reward": 1.043229192495346, + "reward_std": 0.09906813129782677, + "rewards/accuracy_reward": 0.0604166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9828125178813935, + "step": 1099 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.395849609375, + "epoch": 0.352056329012642, + "grad_norm": 0.22602546215057373, + "kl": 0.6235632814466954, + "learning_rate": 1.637492877666677e-05, + "loss": 0.111, + "reward": 1.1000000476837157, + "reward_std": 0.13195635173469783, + "rewards/accuracy_reward": 0.13541666995733975, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833611488343, + "step": 1100 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.6875244140625, + "epoch": 0.35237638022083534, + "grad_norm": 0.09847158193588257, + "kl": 0.35407338961958884, + "learning_rate": 1.6366314125916524e-05, + "loss": 0.0664, + "reward": 1.1432292103767394, + "reward_std": 0.09192095547914506, + "rewards/accuracy_reward": 0.16458333730697633, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458611488342, + "step": 1101 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.289599609375, + "epoch": 0.3526964314290286, + "grad_norm": 0.1806415617465973, + "kl": 0.5341693744063377, + "learning_rate": 1.635769152335484e-05, + "loss": 0.113, + "reward": 1.051041692495346, + "reward_std": 0.12980886101722716, + "rewards/accuracy_reward": 0.08125000353902578, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 1102 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.8729400634766, + "epoch": 0.35301648263722196, + "grad_norm": 0.6527080535888672, + "kl": 0.81370819658041, + "learning_rate": 1.6349060979751744e-05, + "loss": 0.144, + "reward": 1.083854180574417, + "reward_std": 0.14183319211006165, + "rewards/accuracy_reward": 0.12083333693444728, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208432674408, + "step": 1103 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.8625183105469, + "epoch": 0.35333653384541525, + "grad_norm": 0.3019215166568756, + "kl": 0.7167576387524605, + "learning_rate": 1.634042250588717e-05, + "loss": 0.1301, + "reward": 1.0328125178813934, + "reward_std": 0.15510014891624452, + "rewards/accuracy_reward": 0.07500000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9578125059604645, + "step": 1104 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.595849609375, + "epoch": 0.3536565850536086, + "grad_norm": 0.1385013312101364, + "kl": 0.6982532098889351, + "learning_rate": 1.6331776112550956e-05, + "loss": 0.1219, + "reward": 1.1036458492279053, + "reward_std": 0.21694720312952995, + "rewards/accuracy_reward": 0.14583333469927312, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9578125178813934, + "step": 1105 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.2854248046875, + "epoch": 0.35397663626180187, + "grad_norm": 0.40079644322395325, + "kl": 0.6103353053331375, + "learning_rate": 1.6323121810542836e-05, + "loss": 0.1437, + "reward": 0.965104204416275, + "reward_std": 0.1538691446185112, + "rewards/accuracy_reward": 0.010416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9546875298023224, + "step": 1106 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.8979309082031, + "epoch": 0.3542966874699952, + "grad_norm": 0.2274789661169052, + "kl": 0.5727997168898582, + "learning_rate": 1.631445961067242e-05, + "loss": 0.0982, + "reward": 1.0286458611488343, + "reward_std": 0.17690156698226928, + "rewards/accuracy_reward": 0.07083333432674407, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9578125059604645, + "step": 1107 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.4812683105469, + "epoch": 0.3546167386781885, + "grad_norm": 0.45921313762664795, + "kl": 0.6686241254210472, + "learning_rate": 1.6305789523759186e-05, + "loss": 0.0862, + "reward": 1.0213542044162751, + "reward_std": 0.1808813363313675, + "rewards/accuracy_reward": 0.06458333600312471, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9567708432674408, + "step": 1108 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.1604431152343, + "epoch": 0.35493678988638183, + "grad_norm": 0.1756734997034073, + "kl": 1.0812157839536667, + "learning_rate": 1.6297111560632456e-05, + "loss": 0.1661, + "reward": 0.9817708611488343, + "reward_std": 0.21475159972906113, + "rewards/accuracy_reward": 0.0479166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9338541865348816, + "step": 1109 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.1104339599609, + "epoch": 0.3552568410945751, + "grad_norm": 0.2468792200088501, + "kl": 1.1981078289449214, + "learning_rate": 1.62884257321314e-05, + "loss": 0.1969, + "reward": 1.0427083671092987, + "reward_std": 0.18149395957589148, + "rewards/accuracy_reward": 0.10833333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9343750238418579, + "step": 1110 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.9416717529297, + "epoch": 0.35557689230276845, + "grad_norm": 0.20042645931243896, + "kl": 0.9907065749168396, + "learning_rate": 1.6279732049105e-05, + "loss": 0.1766, + "reward": 1.0307291924953461, + "reward_std": 0.2287411093711853, + "rewards/accuracy_reward": 0.10833333842456341, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9223958492279053, + "step": 1111 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.5062683105468, + "epoch": 0.35589694351096174, + "grad_norm": 0.43150994181632996, + "kl": 1.5727191627025605, + "learning_rate": 1.6271030522412066e-05, + "loss": 0.2304, + "reward": 0.9906250178813935, + "reward_std": 0.22456872165203096, + "rewards/accuracy_reward": 0.06458333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9260416865348816, + "step": 1112 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.5145965576172, + "epoch": 0.3562169947191551, + "grad_norm": 0.20752328634262085, + "kl": 1.127117747068405, + "learning_rate": 1.6262321162921186e-05, + "loss": 0.2099, + "reward": 1.049479180574417, + "reward_std": 0.18920135349035264, + "rewards/accuracy_reward": 0.10833333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9411458432674408, + "step": 1113 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.3437683105469, + "epoch": 0.35653704592734836, + "grad_norm": 0.20367443561553955, + "kl": 1.2074265986680985, + "learning_rate": 1.6253603981510742e-05, + "loss": 0.1763, + "reward": 1.0625000298023224, + "reward_std": 0.20715280324220658, + "rewards/accuracy_reward": 0.12500000335276126, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9375000178813935, + "step": 1114 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.4020935058594, + "epoch": 0.3568570971355417, + "grad_norm": 0.2791835367679596, + "kl": 0.6976873815059662, + "learning_rate": 1.6244878989068884e-05, + "loss": 0.1238, + "reward": 1.1109375298023223, + "reward_std": 0.16525277644395828, + "rewards/accuracy_reward": 0.15416667088866234, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9567708492279052, + "step": 1115 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.8916839599609, + "epoch": 0.357177148343735, + "grad_norm": 0.22703680396080017, + "kl": 0.7577734768390656, + "learning_rate": 1.623614619649352e-05, + "loss": 0.1371, + "reward": 1.0723958730697631, + "reward_std": 0.14903780817985535, + "rewards/accuracy_reward": 0.11458333674818277, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9578125178813934, + "step": 1116 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.4645965576171, + "epoch": 0.3574971995519283, + "grad_norm": 0.22214283049106598, + "kl": 0.6411052107810974, + "learning_rate": 1.6227405614692295e-05, + "loss": 0.1377, + "reward": 1.0541666924953461, + "reward_std": 0.17779069989919663, + "rewards/accuracy_reward": 0.10000000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9541666805744171, + "step": 1117 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.2000183105469, + "epoch": 0.3578172507601216, + "grad_norm": 0.25091204047203064, + "kl": 1.055853134393692, + "learning_rate": 1.621865725458259e-05, + "loss": 0.2165, + "reward": 1.1354166984558105, + "reward_std": 0.19953776970505716, + "rewards/accuracy_reward": 0.19166667368263007, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9437500178813935, + "step": 1118 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.5750183105469, + "epoch": 0.35813730196831495, + "grad_norm": 0.31793567538261414, + "kl": 0.9337344884872436, + "learning_rate": 1.6209901127091495e-05, + "loss": 0.1811, + "reward": 1.0005208492279052, + "reward_std": 0.14889881759881973, + "rewards/accuracy_reward": 0.045833334885537626, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9546875178813934, + "step": 1119 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.7541809082031, + "epoch": 0.35845735317650823, + "grad_norm": 0.4377756416797638, + "kl": 1.5817268535494804, + "learning_rate": 1.6201137243155815e-05, + "loss": 0.1998, + "reward": 1.032812523841858, + "reward_std": 0.18937183991074563, + "rewards/accuracy_reward": 0.09583333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9369791865348815, + "step": 1120 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.9750122070312, + "epoch": 0.35877740438470157, + "grad_norm": 0.24275319278240204, + "kl": 0.6483488872647285, + "learning_rate": 1.619236561372202e-05, + "loss": 0.1876, + "reward": 1.1187500298023223, + "reward_std": 0.1460374455899, + "rewards/accuracy_reward": 0.1604166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333551883697, + "step": 1121 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.0521057128906, + "epoch": 0.35909745559289485, + "grad_norm": 0.3204553723335266, + "kl": 0.9119420304894448, + "learning_rate": 1.618358624974628e-05, + "loss": 0.1642, + "reward": 1.0125000119209289, + "reward_std": 0.18264763969928027, + "rewards/accuracy_reward": 0.07083333488553763, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9416666865348816, + "step": 1122 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.004183959961, + "epoch": 0.3594175068010882, + "grad_norm": 0.39938727021217346, + "kl": 0.5670314341783523, + "learning_rate": 1.617479916219441e-05, + "loss": 0.124, + "reward": 1.0885416865348816, + "reward_std": 0.20538848787546157, + "rewards/accuracy_reward": 0.12083333786576986, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083432674408, + "step": 1123 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.3812744140625, + "epoch": 0.3597375580092815, + "grad_norm": 0.11475303769111633, + "kl": 0.294252347946167, + "learning_rate": 1.6166004362041867e-05, + "loss": 0.0812, + "reward": 1.0109375298023224, + "reward_std": 0.12263874225318432, + "rewards/accuracy_reward": 0.03333333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 1124 + }, + { + "clip_ratio": 0.0, + "completion_length": 510.6312622070312, + "epoch": 0.3600576092174748, + "grad_norm": 0.27126359939575195, + "kl": 0.5437161371111869, + "learning_rate": 1.6157201860273764e-05, + "loss": 0.1592, + "reward": 1.0229166924953461, + "reward_std": 0.15174967646598816, + "rewards/accuracy_reward": 0.06041666697710753, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9625000178813934, + "step": 1125 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.8375122070313, + "epoch": 0.3603776604256681, + "grad_norm": 0.27678969502449036, + "kl": 0.3689132109284401, + "learning_rate": 1.614839166788481e-05, + "loss": 0.1059, + "reward": 1.0052083432674408, + "reward_std": 0.1078458171337843, + "rewards/accuracy_reward": 0.02708333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 1126 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.1708343505859, + "epoch": 0.36069771163386144, + "grad_norm": 0.28149381279945374, + "kl": 0.5752302646636963, + "learning_rate": 1.6139573795879337e-05, + "loss": 0.1607, + "reward": 0.9807291805744172, + "reward_std": 0.13525601997971534, + "rewards/accuracy_reward": 0.01458333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 1127 + }, + { + "clip_ratio": 0.0, + "completion_length": 504.80418090820314, + "epoch": 0.3610177628420547, + "grad_norm": 0.18258237838745117, + "kl": 0.4668318539857864, + "learning_rate": 1.6130748255271257e-05, + "loss": 0.1452, + "reward": 1.0510416805744172, + "reward_std": 0.12038996331393718, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.965625011920929, + "step": 1128 + }, + { + "clip_ratio": 0.0, + "completion_length": 499.2229278564453, + "epoch": 0.36133781405024806, + "grad_norm": 0.499051570892334, + "kl": 0.7709709912538528, + "learning_rate": 1.6121915057084064e-05, + "loss": 0.1169, + "reward": 1.0291666984558105, + "reward_std": 0.15609467439353467, + "rewards/accuracy_reward": 0.05416666753590107, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1129 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.5145904541016, + "epoch": 0.36165786525844135, + "grad_norm": 0.2296091914176941, + "kl": 0.5238408371806145, + "learning_rate": 1.6113074212350827e-05, + "loss": 0.155, + "reward": 1.0187500238418579, + "reward_std": 0.13111219555139542, + "rewards/accuracy_reward": 0.05416666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833432674408, + "step": 1130 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.2791778564453, + "epoch": 0.3619779164666347, + "grad_norm": 0.20802690088748932, + "kl": 0.6877270132303238, + "learning_rate": 1.6104225732114143e-05, + "loss": 0.1275, + "reward": 1.0468750178813935, + "reward_std": 0.11990614160895348, + "rewards/accuracy_reward": 0.07083333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416686534882, + "step": 1131 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.7354309082031, + "epoch": 0.36229796767482797, + "grad_norm": 0.3290000259876251, + "kl": 0.7059184789657593, + "learning_rate": 1.609536962742617e-05, + "loss": 0.1368, + "reward": 1.0656250357627868, + "reward_std": 0.1308181770145893, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9656250178813934, + "step": 1132 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.877099609375, + "epoch": 0.3626180188830213, + "grad_norm": 0.21400968730449677, + "kl": 0.5877170011401176, + "learning_rate": 1.6086505909348585e-05, + "loss": 0.1876, + "reward": 1.1864583671092988, + "reward_std": 0.22381700724363326, + "rewards/accuracy_reward": 0.22916667275130748, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916805744172, + "step": 1133 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.80417785644534, + "epoch": 0.3629380700912146, + "grad_norm": 0.11039919406175613, + "kl": 0.2935449294745922, + "learning_rate": 1.6077634588952552e-05, + "loss": 0.0748, + "reward": 1.1015625476837159, + "reward_std": 0.1574238944798708, + "rewards/accuracy_reward": 0.11666666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958432674408, + "step": 1134 + }, + { + "clip_ratio": 0.0, + "completion_length": 489.3020965576172, + "epoch": 0.36325812129940793, + "grad_norm": 0.2583947479724884, + "kl": 0.534557220339775, + "learning_rate": 1.606875567731876e-05, + "loss": 0.0936, + "reward": 1.0109375238418579, + "reward_std": 0.11386194564402104, + "rewards/accuracy_reward": 0.0312500013038516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9796875178813934, + "step": 1135 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.9104278564453, + "epoch": 0.3635781725076012, + "grad_norm": 0.2370891273021698, + "kl": 0.4823115229606628, + "learning_rate": 1.6059869185537363e-05, + "loss": 0.0808, + "reward": 1.0354166865348815, + "reward_std": 0.09459959566593171, + "rewards/accuracy_reward": 0.05000000055879354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166686534882, + "step": 1136 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.7895904541016, + "epoch": 0.3638982237157945, + "grad_norm": 0.3485714793205261, + "kl": 0.479145385324955, + "learning_rate": 1.605097512470799e-05, + "loss": 0.1299, + "reward": 1.0880208551883697, + "reward_std": 0.150136499106884, + "rewards/accuracy_reward": 0.12083333674818278, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875059604644, + "step": 1137 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.0750122070312, + "epoch": 0.36421827492398784, + "grad_norm": 0.35083115100860596, + "kl": 0.46517665684223175, + "learning_rate": 1.6042073505939718e-05, + "loss": 0.1314, + "reward": 1.0151041924953461, + "reward_std": 0.1264990646392107, + "rewards/accuracy_reward": 0.04375000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541805744171, + "step": 1138 + }, + { + "clip_ratio": 0.0, + "completion_length": 510.9229400634766, + "epoch": 0.3645383261321811, + "grad_norm": 0.18946900963783264, + "kl": 0.6526028856635093, + "learning_rate": 1.6033164340351065e-05, + "loss": 0.0952, + "reward": 1.096875011920929, + "reward_std": 0.10533001609146594, + "rewards/accuracy_reward": 0.12291666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1139 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.2708526611328, + "epoch": 0.36485837734037446, + "grad_norm": 0.12671904265880585, + "kl": 0.6638071507215499, + "learning_rate": 1.6024247639069987e-05, + "loss": 0.1385, + "reward": 1.0093750178813934, + "reward_std": 0.15015630498528482, + "rewards/accuracy_reward": 0.04166666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083492279053, + "step": 1140 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.11668090820314, + "epoch": 0.36517842854856775, + "grad_norm": 0.4702635407447815, + "kl": 1.0072061479091645, + "learning_rate": 1.6015323413233838e-05, + "loss": 0.1401, + "reward": 1.076562523841858, + "reward_std": 0.1350021906197071, + "rewards/accuracy_reward": 0.10833333842456341, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291805744171, + "step": 1141 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.72918090820315, + "epoch": 0.3654984797567611, + "grad_norm": 0.2643105089664459, + "kl": 0.6117392227053642, + "learning_rate": 1.6006391673989373e-05, + "loss": 0.1544, + "reward": 1.0864583671092987, + "reward_std": 0.16201163977384567, + "rewards/accuracy_reward": 0.11041667200624943, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1142 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.4666778564453, + "epoch": 0.36581853096495437, + "grad_norm": 0.2908649742603302, + "kl": 0.517003245651722, + "learning_rate": 1.5997452432492732e-05, + "loss": 0.0798, + "reward": 1.020312523841858, + "reward_std": 0.12842915281653405, + "rewards/accuracy_reward": 0.037500002048909666, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.982812511920929, + "step": 1143 + }, + { + "clip_ratio": 0.0, + "completion_length": 495.1750152587891, + "epoch": 0.3661385821731477, + "grad_norm": 0.1781575083732605, + "kl": 0.5948102369904518, + "learning_rate": 1.598850569990944e-05, + "loss": 0.1188, + "reward": 1.0609375298023225, + "reward_std": 0.10798088498413563, + "rewards/accuracy_reward": 0.08125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9796875238418579, + "step": 1144 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.6854370117187, + "epoch": 0.366458633381341, + "grad_norm": 0.2938849627971649, + "kl": 0.38231213241815565, + "learning_rate": 1.5979551487414357e-05, + "loss": 0.0857, + "reward": 1.0234375238418578, + "reward_std": 0.08541666828095913, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541805744171, + "step": 1145 + }, + { + "clip_ratio": 0.0, + "completion_length": 493.327099609375, + "epoch": 0.36677868458953433, + "grad_norm": 0.12273690849542618, + "kl": 0.26505750194191935, + "learning_rate": 1.5970589806191698e-05, + "loss": 0.0526, + "reward": 1.0729166746139527, + "reward_std": 0.09640407245606183, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9916666686534882, + "step": 1146 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.9125183105469, + "epoch": 0.3670987357977276, + "grad_norm": 0.19048544764518738, + "kl": 0.5402480706572532, + "learning_rate": 1.5961620667434997e-05, + "loss": 0.0985, + "reward": 1.0666666865348815, + "reward_std": 0.08264825325459242, + "rewards/accuracy_reward": 0.08333333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333432674408, + "step": 1147 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.0104309082031, + "epoch": 0.36741878700592095, + "grad_norm": 0.320730596780777, + "kl": 0.39287843108177184, + "learning_rate": 1.5952644082347124e-05, + "loss": 0.0834, + "reward": 1.064062523841858, + "reward_std": 0.14702175408601761, + "rewards/accuracy_reward": 0.08750000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625119209289, + "step": 1148 + }, + { + "clip_ratio": 0.0, + "completion_length": 493.71876831054686, + "epoch": 0.36773883821411424, + "grad_norm": 0.11714158952236176, + "kl": 0.2555687852203846, + "learning_rate": 1.5943660062140226e-05, + "loss": 0.0438, + "reward": 1.1031250298023223, + "reward_std": 0.11189155112951994, + "rewards/accuracy_reward": 0.1208333371207118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916865348816, + "step": 1149 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.1354248046875, + "epoch": 0.3680588894223076, + "grad_norm": 0.22005389630794525, + "kl": 0.2881615623831749, + "learning_rate": 1.593466861803575e-05, + "loss": 0.1066, + "reward": 1.0901041865348815, + "reward_std": 0.11322282254695892, + "rewards/accuracy_reward": 0.11041666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9796875059604645, + "step": 1150 + }, + { + "clip_ratio": 0.0, + "completion_length": 491.908349609375, + "epoch": 0.36837894063050086, + "grad_norm": 0.5133959650993347, + "kl": 0.6626154512166977, + "learning_rate": 1.592566976126441e-05, + "loss": 0.1367, + "reward": 1.1182291805744171, + "reward_std": 0.13968872725963594, + "rewards/accuracy_reward": 0.14375000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791805744171, + "step": 1151 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.1250122070312, + "epoch": 0.3686989918386942, + "grad_norm": 0.17595118284225464, + "kl": 0.43243874460458753, + "learning_rate": 1.5916663503066184e-05, + "loss": 0.087, + "reward": 1.0171875178813934, + "reward_std": 0.05748256333172321, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541805744171, + "step": 1152 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.4770965576172, + "epoch": 0.3690190430468875, + "grad_norm": 0.07913016527891159, + "kl": 0.2869682595133781, + "learning_rate": 1.5907649854690292e-05, + "loss": 0.0644, + "reward": 1.0677083611488343, + "reward_std": 0.08603444769978523, + "rewards/accuracy_reward": 0.08333333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750178813935, + "step": 1153 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.520849609375, + "epoch": 0.3693390942550808, + "grad_norm": 0.2064250111579895, + "kl": 0.4336912453174591, + "learning_rate": 1.5898628827395177e-05, + "loss": 0.1009, + "reward": 1.0192708551883698, + "reward_std": 0.09977535083889962, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208492279053, + "step": 1154 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.8770965576172, + "epoch": 0.3696591454632741, + "grad_norm": 0.19949369132518768, + "kl": 0.3324578292667866, + "learning_rate": 1.5889600432448515e-05, + "loss": 0.0774, + "reward": 1.1135416746139526, + "reward_std": 0.08293756693601609, + "rewards/accuracy_reward": 0.12500000204890965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9885416686534881, + "step": 1155 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.8979339599609, + "epoch": 0.36997919667146745, + "grad_norm": 0.6426315307617188, + "kl": 0.7445389926433563, + "learning_rate": 1.5880564681127172e-05, + "loss": 0.1254, + "reward": 1.0541666924953461, + "reward_std": 0.12224040143191814, + "rewards/accuracy_reward": 0.08333333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333492279053, + "step": 1156 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.3125122070312, + "epoch": 0.37029924787966073, + "grad_norm": 0.2057666778564453, + "kl": 0.45236001163721085, + "learning_rate": 1.5871521584717207e-05, + "loss": 0.1005, + "reward": 1.103125023841858, + "reward_std": 0.10306334141641856, + "rewards/accuracy_reward": 0.12500000409781933, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 1157 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.7187683105469, + "epoch": 0.37061929908785407, + "grad_norm": 0.3101314902305603, + "kl": 0.7116847023367882, + "learning_rate": 1.5862471154513853e-05, + "loss": 0.132, + "reward": 1.0463541865348815, + "reward_std": 0.12737105637788773, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708492279053, + "step": 1158 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.9916809082031, + "epoch": 0.37093935029604735, + "grad_norm": 0.30432990193367004, + "kl": 0.6676271669566631, + "learning_rate": 1.58534134018215e-05, + "loss": 0.102, + "reward": 0.9994791865348815, + "reward_std": 0.10984005965292454, + "rewards/accuracy_reward": 0.02916666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125119209289, + "step": 1159 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.6187713623046, + "epoch": 0.3712594015042407, + "grad_norm": 0.29318493604660034, + "kl": 0.40648306608200074, + "learning_rate": 1.5844348337953682e-05, + "loss": 0.1287, + "reward": 0.9895833551883697, + "reward_std": 0.15547448098659516, + "rewards/accuracy_reward": 0.022916667349636554, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666746139526, + "step": 1160 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.220849609375, + "epoch": 0.371579452712434, + "grad_norm": 0.1845497488975525, + "kl": 0.5116272836923599, + "learning_rate": 1.5835275974233083e-05, + "loss": 0.0779, + "reward": 1.0260416924953462, + "reward_std": 0.13704813569784163, + "rewards/accuracy_reward": 0.05416666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750238418579, + "step": 1161 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.5021057128906, + "epoch": 0.3718995039206273, + "grad_norm": 0.34543439745903015, + "kl": 0.5618045464158058, + "learning_rate": 1.5826196321991484e-05, + "loss": 0.1316, + "reward": 1.0187500298023224, + "reward_std": 0.14411462992429733, + "rewards/accuracy_reward": 0.052083334885537624, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666805744171, + "step": 1162 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.439599609375, + "epoch": 0.3722195551288206, + "grad_norm": 0.27085787057876587, + "kl": 0.4931087389588356, + "learning_rate": 1.581710939256978e-05, + "loss": 0.0706, + "reward": 1.1380208611488343, + "reward_std": 0.16875488683581352, + "rewards/accuracy_reward": 0.16250000484287738, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208492279053, + "step": 1163 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.0229370117188, + "epoch": 0.37253960633701394, + "grad_norm": 0.24295267462730408, + "kl": 0.6016337320208549, + "learning_rate": 1.5808015197317944e-05, + "loss": 0.073, + "reward": 1.040625023841858, + "reward_std": 0.12459696829319, + "rewards/accuracy_reward": 0.058333334513008595, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916746139526, + "step": 1164 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.7375091552734, + "epoch": 0.3728596575452072, + "grad_norm": 0.46193087100982666, + "kl": 0.49578318893909457, + "learning_rate": 1.5798913747595038e-05, + "loss": 0.1141, + "reward": 1.030208373069763, + "reward_std": 0.13383289575576782, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 1165 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.8583465576172, + "epoch": 0.37317970875340056, + "grad_norm": 0.24223735928535461, + "kl": 0.7255073443055153, + "learning_rate": 1.5789805054769187e-05, + "loss": 0.1319, + "reward": 1.067187535762787, + "reward_std": 0.17425041720271112, + "rewards/accuracy_reward": 0.10833333600312471, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9588541924953461, + "step": 1166 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.708349609375, + "epoch": 0.37349975996159385, + "grad_norm": 0.2238290160894394, + "kl": 0.6970250770449639, + "learning_rate": 1.578068913021755e-05, + "loss": 0.1519, + "reward": 1.0447916984558105, + "reward_std": 0.13058854918926954, + "rewards/accuracy_reward": 0.07083333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1167 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.5500183105469, + "epoch": 0.3738198111697872, + "grad_norm": 0.20912905037403107, + "kl": 0.735250449180603, + "learning_rate": 1.5771565985326323e-05, + "loss": 0.1231, + "reward": 1.0281250298023223, + "reward_std": 0.1537714421749115, + "rewards/accuracy_reward": 0.06666666846722365, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583432674408, + "step": 1168 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.535433959961, + "epoch": 0.37413986237798047, + "grad_norm": 0.40195563435554504, + "kl": 0.7344786658883095, + "learning_rate": 1.5762435631490732e-05, + "loss": 0.1266, + "reward": 1.0489583671092988, + "reward_std": 0.14664312303066254, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916865348816, + "step": 1169 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.6916809082031, + "epoch": 0.3744599135861738, + "grad_norm": 0.37209442257881165, + "kl": 0.925914877653122, + "learning_rate": 1.5753298080114983e-05, + "loss": 0.1411, + "reward": 0.9750000238418579, + "reward_std": 0.14015717357397078, + "rewards/accuracy_reward": 0.01250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9625000178813934, + "step": 1170 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.7041931152344, + "epoch": 0.3747799647943671, + "grad_norm": 0.3376603424549103, + "kl": 0.7903919830918312, + "learning_rate": 1.57441533426123e-05, + "loss": 0.1121, + "reward": 1.0151041865348815, + "reward_std": 0.12867529951035978, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.967187511920929, + "step": 1171 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.6396057128907, + "epoch": 0.37510001600256043, + "grad_norm": 0.15526770055294037, + "kl": 0.54666518419981, + "learning_rate": 1.5735001430404864e-05, + "loss": 0.0911, + "reward": 1.0822916805744172, + "reward_std": 0.13302346915006638, + "rewards/accuracy_reward": 0.10833333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1172 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.8000183105469, + "epoch": 0.3754200672107537, + "grad_norm": 0.22396454215049744, + "kl": 0.6107922196388245, + "learning_rate": 1.5725842354923823e-05, + "loss": 0.1063, + "reward": 1.0338541924953462, + "reward_std": 0.12921084687113762, + "rewards/accuracy_reward": 0.0541666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.979687511920929, + "step": 1173 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.3708526611329, + "epoch": 0.37574011841894706, + "grad_norm": 0.26841455698013306, + "kl": 0.4723584517836571, + "learning_rate": 1.5716676127609277e-05, + "loss": 0.0914, + "reward": 1.1182291984558106, + "reward_std": 0.15276648811995983, + "rewards/accuracy_reward": 0.1437500050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791746139526, + "step": 1174 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.8854339599609, + "epoch": 0.37606016962714034, + "grad_norm": 0.2657756507396698, + "kl": 0.5366487547755241, + "learning_rate": 1.5707502759910246e-05, + "loss": 0.0807, + "reward": 1.0348958551883698, + "reward_std": 0.15992402881383896, + "rewards/accuracy_reward": 0.06458333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125059604645, + "step": 1175 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.4541809082032, + "epoch": 0.3763802208353337, + "grad_norm": 0.13122889399528503, + "kl": 0.5268323123455048, + "learning_rate": 1.5698322263284683e-05, + "loss": 0.1078, + "reward": 1.0385416924953461, + "reward_std": 0.10363166444003583, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 1176 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.8583557128907, + "epoch": 0.37670027204352696, + "grad_norm": 0.24503201246261597, + "kl": 0.5733169555664063, + "learning_rate": 1.568913464919944e-05, + "loss": 0.1181, + "reward": 1.041666704416275, + "reward_std": 0.1343327358365059, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666805744171, + "step": 1177 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.1437652587891, + "epoch": 0.3770203232517203, + "grad_norm": 0.1513802856206894, + "kl": 0.3681658856570721, + "learning_rate": 1.5679939929130256e-05, + "loss": 0.074, + "reward": 1.126562511920929, + "reward_std": 0.11795764788985252, + "rewards/accuracy_reward": 0.14583333656191827, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291746139526, + "step": 1178 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.7875183105468, + "epoch": 0.3773403744599136, + "grad_norm": 0.12650415301322937, + "kl": 0.48660945147275925, + "learning_rate": 1.5670738114561744e-05, + "loss": 0.1086, + "reward": 1.0734375298023224, + "reward_std": 0.163792784512043, + "rewards/accuracy_reward": 0.10833333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041746139526, + "step": 1179 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.8083465576171, + "epoch": 0.37766042566810687, + "grad_norm": 0.21940185129642487, + "kl": 0.8333341613411903, + "learning_rate": 1.5661529216987393e-05, + "loss": 0.186, + "reward": 0.9989583551883697, + "reward_std": 0.16733265221118926, + "rewards/accuracy_reward": 0.05625000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9427083492279053, + "step": 1180 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.0000152587891, + "epoch": 0.3779804768763002, + "grad_norm": 0.47936850786209106, + "kl": 1.1114118099212646, + "learning_rate": 1.565231324790952e-05, + "loss": 0.1621, + "reward": 1.0494791984558105, + "reward_std": 0.21776344440877438, + "rewards/accuracy_reward": 0.10625000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9432291865348816, + "step": 1181 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.5354370117187, + "epoch": 0.3783005280844935, + "grad_norm": 0.20487003028392792, + "kl": 0.8265619874000549, + "learning_rate": 1.564309021883929e-05, + "loss": 0.1495, + "reward": 1.0140625298023225, + "reward_std": 0.16638899594545364, + "rewards/accuracy_reward": 0.06458333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9494791865348816, + "step": 1182 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.5479339599609, + "epoch": 0.37862057929268683, + "grad_norm": 0.2059021294116974, + "kl": 0.8217375859618187, + "learning_rate": 1.563386014129667e-05, + "loss": 0.2063, + "reward": 1.097916692495346, + "reward_std": 0.17657624781131745, + "rewards/accuracy_reward": 0.14375000353902578, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9520833492279053, + "step": 1183 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.3583526611328, + "epoch": 0.3789406305008801, + "grad_norm": 0.36007773876190186, + "kl": 1.1445650905370712, + "learning_rate": 1.5624623026810445e-05, + "loss": 0.2036, + "reward": 1.0109375298023224, + "reward_std": 0.19519389644265175, + "rewards/accuracy_reward": 0.07708333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9338541924953461, + "step": 1184 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.383349609375, + "epoch": 0.37926068170907346, + "grad_norm": 0.13779407739639282, + "kl": 0.8353869661688804, + "learning_rate": 1.5615378886918183e-05, + "loss": 0.1376, + "reward": 1.0270833611488341, + "reward_std": 0.20012224316596985, + "rewards/accuracy_reward": 0.08541666846722365, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9416666865348816, + "step": 1185 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.8979370117188, + "epoch": 0.37958073291726674, + "grad_norm": 0.2753371298313141, + "kl": 0.9140992000699043, + "learning_rate": 1.5606127733166237e-05, + "loss": 0.1652, + "reward": 1.0192708432674409, + "reward_std": 0.1692800521850586, + "rewards/accuracy_reward": 0.06875000018626451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9505208432674408, + "step": 1186 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.2021057128907, + "epoch": 0.3799007841254601, + "grad_norm": 0.18957631289958954, + "kl": 1.0052045956254005, + "learning_rate": 1.5596869577109705e-05, + "loss": 0.1419, + "reward": 1.026562511920929, + "reward_std": 0.18854286577552556, + "rewards/accuracy_reward": 0.08750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9390625119209289, + "step": 1187 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.2250183105468, + "epoch": 0.38022083533365336, + "grad_norm": 0.14077381789684296, + "kl": 0.771829554438591, + "learning_rate": 1.5587604430312436e-05, + "loss": 0.1185, + "reward": 1.0666666865348815, + "reward_std": 0.19701256975531578, + "rewards/accuracy_reward": 0.1104166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.956250011920929, + "step": 1188 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.6041870117188, + "epoch": 0.3805408865418467, + "grad_norm": 0.20995795726776123, + "kl": 0.8531022161245346, + "learning_rate": 1.5578332304347016e-05, + "loss": 0.1283, + "reward": 1.0380208551883698, + "reward_std": 0.17556421980261802, + "rewards/accuracy_reward": 0.09166666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9463541805744171, + "step": 1189 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.9125183105468, + "epoch": 0.38086093775004, + "grad_norm": 0.23752635717391968, + "kl": 0.9788353681564331, + "learning_rate": 1.5569053210794748e-05, + "loss": 0.095, + "reward": 1.020312523841858, + "reward_std": 0.13795197159051895, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9494791805744172, + "step": 1190 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.4187652587891, + "epoch": 0.3811809889582333, + "grad_norm": 0.3365929424762726, + "kl": 1.0373542308807373, + "learning_rate": 1.5559767161245633e-05, + "loss": 0.1289, + "reward": 1.0098958671092988, + "reward_std": 0.2035977765917778, + "rewards/accuracy_reward": 0.0687500013038516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9411458551883698, + "step": 1191 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.7958557128907, + "epoch": 0.3815010401664266, + "grad_norm": 0.13403218984603882, + "kl": 0.6618030473589898, + "learning_rate": 1.5550474167298364e-05, + "loss": 0.0804, + "reward": 1.1020833611488343, + "reward_std": 0.1049573190510273, + "rewards/accuracy_reward": 0.1375000050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833432674408, + "step": 1192 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.7479431152344, + "epoch": 0.38182109137461995, + "grad_norm": 0.16500230133533478, + "kl": 0.7753990903496742, + "learning_rate": 1.5541174240560303e-05, + "loss": 0.1341, + "reward": 1.091666692495346, + "reward_std": 0.1889185607433319, + "rewards/accuracy_reward": 0.13333333544433118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333492279053, + "step": 1193 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.5000183105469, + "epoch": 0.38214114258281323, + "grad_norm": 0.2911393344402313, + "kl": 0.9876785755157471, + "learning_rate": 1.553186739264748e-05, + "loss": 0.1403, + "reward": 1.0031250238418579, + "reward_std": 0.17999138236045836, + "rewards/accuracy_reward": 0.05833333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9447916805744171, + "step": 1194 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.6625213623047, + "epoch": 0.38246119379100657, + "grad_norm": 0.4007449448108673, + "kl": 1.0350771889090538, + "learning_rate": 1.5522553635184567e-05, + "loss": 0.1465, + "reward": 1.007812535762787, + "reward_std": 0.18713185042142869, + "rewards/accuracy_reward": 0.058333334513008595, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9494791805744172, + "step": 1195 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.0166839599609, + "epoch": 0.38278124499919985, + "grad_norm": 0.10396149754524231, + "kl": 0.5359139025211335, + "learning_rate": 1.5513232979804854e-05, + "loss": 0.0613, + "reward": 1.0031250238418579, + "reward_std": 0.1407000742852688, + "rewards/accuracy_reward": 0.04166666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583432674408, + "step": 1196 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.1437683105469, + "epoch": 0.3831012962073932, + "grad_norm": 0.17239876091480255, + "kl": 0.6517708688974381, + "learning_rate": 1.550390543815026e-05, + "loss": 0.1449, + "reward": 1.0859375298023224, + "reward_std": 0.17106188386678695, + "rewards/accuracy_reward": 0.11666667181998491, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.967187511920929, + "step": 1197 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.6750244140625, + "epoch": 0.3834213474155865, + "grad_norm": 0.24555876851081848, + "kl": 0.49216202795505526, + "learning_rate": 1.549457102187131e-05, + "loss": 0.1128, + "reward": 1.1161458969116211, + "reward_std": 0.2306416004896164, + "rewards/accuracy_reward": 0.15208333656191825, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9619791865348816, + "step": 1198 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.770849609375, + "epoch": 0.3837413986237798, + "grad_norm": 0.1278693675994873, + "kl": 0.5300887562334538, + "learning_rate": 1.5485229742627102e-05, + "loss": 0.0528, + "reward": 1.1583333611488342, + "reward_std": 0.15799203217029573, + "rewards/accuracy_reward": 0.1895833395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500178813935, + "step": 1199 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.7000244140625, + "epoch": 0.3840614498319731, + "grad_norm": 0.36602315306663513, + "kl": 0.49914331585168836, + "learning_rate": 1.5475881612085313e-05, + "loss": 0.1042, + "reward": 1.0505208730697633, + "reward_std": 0.17699409797787666, + "rewards/accuracy_reward": 0.08125000223517417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708551883698, + "step": 1200 + }, + { + "clip_ratio": 0.0, + "completion_length": 508.54169006347655, + "epoch": 0.38438150104016644, + "grad_norm": 0.17335864901542664, + "kl": 0.46766447871923444, + "learning_rate": 1.5466526641922174e-05, + "loss": 0.077, + "reward": 1.032291704416275, + "reward_std": 0.13878519721329213, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1201 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.2083435058594, + "epoch": 0.3847015522483597, + "grad_norm": 0.12205848842859268, + "kl": 0.32414179742336274, + "learning_rate": 1.5457164843822465e-05, + "loss": 0.0973, + "reward": 1.1067708551883697, + "reward_std": 0.11873383224010467, + "rewards/accuracy_reward": 0.13541667275130748, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 1202 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.014599609375, + "epoch": 0.38502160345655306, + "grad_norm": 0.11642692983150482, + "kl": 0.6380541652441025, + "learning_rate": 1.5447796229479495e-05, + "loss": 0.1153, + "reward": 1.0760416984558105, + "reward_std": 0.1689482469111681, + "rewards/accuracy_reward": 0.11458333730697631, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583551883698, + "step": 1203 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.6271026611328, + "epoch": 0.38534165466474635, + "grad_norm": 0.12941080331802368, + "kl": 0.30585852190852164, + "learning_rate": 1.5438420810595073e-05, + "loss": 0.0648, + "reward": 1.0114583611488341, + "reward_std": 0.10651846360415221, + "rewards/accuracy_reward": 0.029166668094694613, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916805744171, + "step": 1204 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.935433959961, + "epoch": 0.3856617058729397, + "grad_norm": 0.3982611298561096, + "kl": 0.6207052066922187, + "learning_rate": 1.5429038598879526e-05, + "loss": 0.0918, + "reward": 1.0307291924953461, + "reward_std": 0.15194091200828552, + "rewards/accuracy_reward": 0.06041666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125178813934, + "step": 1205 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.6979339599609, + "epoch": 0.38598175708113297, + "grad_norm": 0.06387288123369217, + "kl": 0.30112158581614495, + "learning_rate": 1.5419649606051648e-05, + "loss": 0.0393, + "reward": 1.079687523841858, + "reward_std": 0.09635667633265257, + "rewards/accuracy_reward": 0.09583333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541746139526, + "step": 1206 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.0291778564454, + "epoch": 0.3863018082893263, + "grad_norm": 0.2889772057533264, + "kl": 0.6139212623238564, + "learning_rate": 1.5410253843838717e-05, + "loss": 0.0946, + "reward": 1.0229166924953461, + "reward_std": 0.10145474877208471, + "rewards/accuracy_reward": 0.04791666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1207 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.7708618164063, + "epoch": 0.3866218594975196, + "grad_norm": 0.22425009310245514, + "kl": 0.39511779621243476, + "learning_rate": 1.540085132397646e-05, + "loss": 0.0991, + "reward": 1.110416704416275, + "reward_std": 0.12801450863480568, + "rewards/accuracy_reward": 0.13333333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833492279053, + "step": 1208 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.0166870117188, + "epoch": 0.38694191070571293, + "grad_norm": 0.16940173506736755, + "kl": 0.48901860415935516, + "learning_rate": 1.539144205820905e-05, + "loss": 0.1193, + "reward": 1.0723958790302277, + "reward_std": 0.18953076004981995, + "rewards/accuracy_reward": 0.11041666809469461, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.961979192495346, + "step": 1209 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.7062713623047, + "epoch": 0.3872619619139062, + "grad_norm": 0.24883034825325012, + "kl": 0.6189972922205925, + "learning_rate": 1.538202605828907e-05, + "loss": 0.1052, + "reward": 0.9833333611488342, + "reward_std": 0.15337586775422096, + "rewards/accuracy_reward": 0.02500000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333551883697, + "step": 1210 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.789599609375, + "epoch": 0.38758201312209956, + "grad_norm": 0.5786344408988953, + "kl": 0.9378010019659996, + "learning_rate": 1.5372603335977537e-05, + "loss": 0.1549, + "reward": 0.9421875298023223, + "reward_std": 0.22690111324191092, + "rewards/accuracy_reward": 0.027083333395421506, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9109375178813934, + "step": 1211 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.358349609375, + "epoch": 0.38790206433029284, + "grad_norm": 0.17479492723941803, + "kl": 0.7791576758027077, + "learning_rate": 1.536317390304385e-05, + "loss": 0.1551, + "reward": 0.9604166865348815, + "reward_std": 0.22087621092796325, + "rewards/accuracy_reward": 0.03541666809469461, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9250000178813934, + "step": 1212 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.2312683105469, + "epoch": 0.3882221155384862, + "grad_norm": 0.1706581711769104, + "kl": 0.5600812263786793, + "learning_rate": 1.5353737771265785e-05, + "loss": 0.0847, + "reward": 0.971354192495346, + "reward_std": 0.1764809437096119, + "rewards/accuracy_reward": 0.02500000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9463541865348816, + "step": 1213 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.3479370117187, + "epoch": 0.38854216674667946, + "grad_norm": 0.12541788816452026, + "kl": 0.3706213489174843, + "learning_rate": 1.5344294952429506e-05, + "loss": 0.0839, + "reward": 1.029166692495346, + "reward_std": 0.20604985877871512, + "rewards/accuracy_reward": 0.0750000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9541666865348816, + "step": 1214 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.0854431152344, + "epoch": 0.3888622179548728, + "grad_norm": 0.30533111095428467, + "kl": 0.4709593154489994, + "learning_rate": 1.5334845458329505e-05, + "loss": 0.0993, + "reward": 1.1036458492279053, + "reward_std": 0.15157056059688329, + "rewards/accuracy_reward": 0.15625000409781933, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9473958492279053, + "step": 1215 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.7583526611328, + "epoch": 0.3891822691630661, + "grad_norm": 0.2925158143043518, + "kl": 0.510768836736679, + "learning_rate": 1.532538930076863e-05, + "loss": 0.0869, + "reward": 1.0177083611488342, + "reward_std": 0.20111262276768685, + "rewards/accuracy_reward": 0.06250000055879354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9552083492279053, + "step": 1216 + }, + { + "clip_ratio": 0.0, + "completion_length": 624.5041931152343, + "epoch": 0.3895023203712594, + "grad_norm": 0.30002668499946594, + "kl": 0.3750695250928402, + "learning_rate": 1.5315926491558045e-05, + "loss": 0.0757, + "reward": 1.0395833611488343, + "reward_std": 0.18180562406778336, + "rewards/accuracy_reward": 0.08125000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333492279053, + "step": 1217 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.264599609375, + "epoch": 0.3898223715794527, + "grad_norm": 0.22842350602149963, + "kl": 0.5847925186157227, + "learning_rate": 1.5306457042517218e-05, + "loss": 0.0595, + "reward": 0.9734375238418579, + "reward_std": 0.11920136883854866, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9630208551883698, + "step": 1218 + }, + { + "clip_ratio": 0.0, + "completion_length": 629.6958557128906, + "epoch": 0.39014242278764605, + "grad_norm": 0.14000339806079865, + "kl": 0.4536043472588062, + "learning_rate": 1.5296980965473918e-05, + "loss": 0.0439, + "reward": 1.099479180574417, + "reward_std": 0.13088055774569513, + "rewards/accuracy_reward": 0.12500000298023223, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9723958492279052, + "step": 1219 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.4354309082031, + "epoch": 0.39046247399583933, + "grad_norm": 0.2920055091381073, + "kl": 0.48158343955874444, + "learning_rate": 1.5287498272264192e-05, + "loss": 0.1022, + "reward": 1.1161458492279053, + "reward_std": 0.1597229868173599, + "rewards/accuracy_reward": 0.14791667014360427, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291746139526, + "step": 1220 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.8666839599609, + "epoch": 0.39078252520403267, + "grad_norm": 0.3325548470020294, + "kl": 0.7620904207229614, + "learning_rate": 1.5278008974732346e-05, + "loss": 0.1268, + "reward": 1.0489583611488342, + "reward_std": 0.2112396091222763, + "rewards/accuracy_reward": 0.09166666734963655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916865348816, + "step": 1221 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.5583557128906, + "epoch": 0.39110257641222596, + "grad_norm": 0.1554885059595108, + "kl": 0.44617650359869004, + "learning_rate": 1.5268513084730935e-05, + "loss": 0.0812, + "reward": 1.0286458611488343, + "reward_std": 0.1501821421086788, + "rewards/accuracy_reward": 0.06250000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458492279053, + "step": 1222 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.2125244140625, + "epoch": 0.3914226276204193, + "grad_norm": 0.12901164591312408, + "kl": 0.5715129643678665, + "learning_rate": 1.5259010614120755e-05, + "loss": 0.1106, + "reward": 1.045312523841858, + "reward_std": 0.1480789568275213, + "rewards/accuracy_reward": 0.08958333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291805744171, + "step": 1223 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.1708618164063, + "epoch": 0.3917426788286126, + "grad_norm": 0.14290817081928253, + "kl": 0.5230883605778217, + "learning_rate": 1.5249501574770815e-05, + "loss": 0.074, + "reward": 1.0343750357627868, + "reward_std": 0.16299154441803693, + "rewards/accuracy_reward": 0.06458333414047956, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 1224 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.3875213623047, + "epoch": 0.39206273003680586, + "grad_norm": 0.1658349335193634, + "kl": 0.3186629630625248, + "learning_rate": 1.5239985978558333e-05, + "loss": 0.0573, + "reward": 1.0692708551883698, + "reward_std": 0.10922669228166342, + "rewards/accuracy_reward": 0.09375000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1225 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.2208435058594, + "epoch": 0.3923827812449992, + "grad_norm": 0.09366770088672638, + "kl": 0.3164710126817226, + "learning_rate": 1.5230463837368713e-05, + "loss": 0.0658, + "reward": 1.0744791865348815, + "reward_std": 0.09520575925707817, + "rewards/accuracy_reward": 0.09583333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458432674408, + "step": 1226 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.1521057128906, + "epoch": 0.3927028324531925, + "grad_norm": 0.14050906896591187, + "kl": 0.4312257021665573, + "learning_rate": 1.5220935163095534e-05, + "loss": 0.1062, + "reward": 1.0781250298023224, + "reward_std": 0.14153016209602357, + "rewards/accuracy_reward": 0.11458333730697631, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.963541692495346, + "step": 1227 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.0312652587891, + "epoch": 0.3930228836613858, + "grad_norm": 0.12579607963562012, + "kl": 0.45440919920802114, + "learning_rate": 1.521139996764054e-05, + "loss": 0.0825, + "reward": 1.0364583551883697, + "reward_std": 0.10581081509590148, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916865348816, + "step": 1228 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.8896057128907, + "epoch": 0.3933429348695791, + "grad_norm": 0.13927984237670898, + "kl": 0.44168696030974386, + "learning_rate": 1.5201858262913619e-05, + "loss": 0.0928, + "reward": 1.0677083611488343, + "reward_std": 0.15336225517094135, + "rewards/accuracy_reward": 0.0937500026077032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1229 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.508349609375, + "epoch": 0.39366298607777245, + "grad_norm": 0.16402070224285126, + "kl": 0.3885528713464737, + "learning_rate": 1.519231006083278e-05, + "loss": 0.0758, + "reward": 1.030208373069763, + "reward_std": 0.1770285289734602, + "rewards/accuracy_reward": 0.05625000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 1230 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.4666870117187, + "epoch": 0.39398303728596573, + "grad_norm": 0.08129875361919403, + "kl": 0.3222904376685619, + "learning_rate": 1.5182755373324162e-05, + "loss": 0.0497, + "reward": 1.086979192495346, + "reward_std": 0.07945878580212593, + "rewards/accuracy_reward": 0.10625000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291805744172, + "step": 1231 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.8791900634766, + "epoch": 0.39430308849415907, + "grad_norm": 0.1860627830028534, + "kl": 0.4070753358304501, + "learning_rate": 1.5173194212321996e-05, + "loss": 0.1034, + "reward": 1.0458333492279053, + "reward_std": 0.12442791275680065, + "rewards/accuracy_reward": 0.07500000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 1232 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.0083557128906, + "epoch": 0.39462313970235235, + "grad_norm": 0.15054498612880707, + "kl": 0.22181895673274993, + "learning_rate": 1.5163626589768598e-05, + "loss": 0.0497, + "reward": 1.0265625178813935, + "reward_std": 0.09489289149641991, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958432674408, + "step": 1233 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.5416931152344, + "epoch": 0.3949431909105457, + "grad_norm": 0.10187875479459763, + "kl": 0.375130108743906, + "learning_rate": 1.5154052517614361e-05, + "loss": 0.0653, + "reward": 1.0416666924953462, + "reward_std": 0.14163720104843378, + "rewards/accuracy_reward": 0.07291666921228171, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500238418579, + "step": 1234 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.6958618164062, + "epoch": 0.395263242118739, + "grad_norm": 0.169550821185112, + "kl": 0.3253509186208248, + "learning_rate": 1.5144472007817723e-05, + "loss": 0.0672, + "reward": 1.0520833551883697, + "reward_std": 0.10514021962881089, + "rewards/accuracy_reward": 0.07083333488553763, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.981250011920929, + "step": 1235 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.4812652587891, + "epoch": 0.3955832933269323, + "grad_norm": 0.09590303897857666, + "kl": 0.2758027367293835, + "learning_rate": 1.5134885072345178e-05, + "loss": 0.0747, + "reward": 1.1109375357627869, + "reward_std": 0.15622661411762237, + "rewards/accuracy_reward": 0.13541666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1236 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.527099609375, + "epoch": 0.3959033445351256, + "grad_norm": 0.1634804904460907, + "kl": 0.312994534522295, + "learning_rate": 1.512529172317123e-05, + "loss": 0.0633, + "reward": 1.1515625357627868, + "reward_std": 0.1235880684107542, + "rewards/accuracy_reward": 0.1791666727513075, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958611488343, + "step": 1237 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.0979370117187, + "epoch": 0.39622339574331894, + "grad_norm": 0.19042591750621796, + "kl": 0.4922703742980957, + "learning_rate": 1.511569197227841e-05, + "loss": 0.0658, + "reward": 1.0942708611488343, + "reward_std": 0.1376211117953062, + "rewards/accuracy_reward": 0.12291667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 1238 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.7187622070312, + "epoch": 0.3965434469515122, + "grad_norm": 0.18415957689285278, + "kl": 0.6596296966075897, + "learning_rate": 1.5106085831657229e-05, + "loss": 0.0724, + "reward": 1.1057291984558106, + "reward_std": 0.11891843751072884, + "rewards/accuracy_reward": 0.1354166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125178813934, + "step": 1239 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.3958435058594, + "epoch": 0.39686349815970556, + "grad_norm": 0.10486699640750885, + "kl": 0.23581696003675462, + "learning_rate": 1.509647331330619e-05, + "loss": 0.0432, + "reward": 1.1166666865348815, + "reward_std": 0.08925211485475301, + "rewards/accuracy_reward": 0.12916666995733977, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.987500011920929, + "step": 1240 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.5791778564453, + "epoch": 0.39718354936789885, + "grad_norm": 0.12086888402700424, + "kl": 0.337103009223938, + "learning_rate": 1.5086854429231763e-05, + "loss": 0.0375, + "reward": 1.010937511920929, + "reward_std": 0.11605828888714313, + "rewards/accuracy_reward": 0.02708333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541746139526, + "step": 1241 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.839599609375, + "epoch": 0.3975036005760922, + "grad_norm": 0.16846847534179688, + "kl": 0.5034018464386463, + "learning_rate": 1.5077229191448357e-05, + "loss": 0.0618, + "reward": 1.1223958492279054, + "reward_std": 0.114602355286479, + "rewards/accuracy_reward": 0.14166666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291865348816, + "step": 1242 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.495849609375, + "epoch": 0.39782365178428547, + "grad_norm": 0.1870306432247162, + "kl": 0.19878256246447562, + "learning_rate": 1.506759761197833e-05, + "loss": 0.0617, + "reward": 1.1442708671092987, + "reward_std": 0.07291666753590106, + "rewards/accuracy_reward": 0.1562500050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9880208432674408, + "step": 1243 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.8666900634765, + "epoch": 0.3981437029924788, + "grad_norm": 0.25140678882598877, + "kl": 0.22704439386725425, + "learning_rate": 1.5057959702851953e-05, + "loss": 0.0576, + "reward": 1.0994792044162751, + "reward_std": 0.10983092840760947, + "rewards/accuracy_reward": 0.11666667088866234, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9828125178813935, + "step": 1244 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.3166961669922, + "epoch": 0.3984637542006721, + "grad_norm": 0.210471972823143, + "kl": 0.4046620957553387, + "learning_rate": 1.5048315476107412e-05, + "loss": 0.0813, + "reward": 1.1104166805744171, + "reward_std": 0.1495097540318966, + "rewards/accuracy_reward": 0.13750000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166865348816, + "step": 1245 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.5750152587891, + "epoch": 0.39878380540886543, + "grad_norm": 0.09761213511228561, + "kl": 0.21089787110686303, + "learning_rate": 1.5038664943790768e-05, + "loss": 0.054, + "reward": 1.0968750238418579, + "reward_std": 0.08261781334877014, + "rewards/accuracy_reward": 0.11250000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1246 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.989599609375, + "epoch": 0.3991038566170587, + "grad_norm": 0.19265764951705933, + "kl": 0.6630521953105927, + "learning_rate": 1.5029008117955978e-05, + "loss": 0.1042, + "reward": 1.1255208730697632, + "reward_std": 0.1521202649921179, + "rewards/accuracy_reward": 0.15000000335276126, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208492279053, + "step": 1247 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.2479278564454, + "epoch": 0.39942390782525206, + "grad_norm": 0.1177937313914299, + "kl": 0.32113772705197335, + "learning_rate": 1.5019345010664845e-05, + "loss": 0.0734, + "reward": 1.1291666984558106, + "reward_std": 0.11449034418910742, + "rewards/accuracy_reward": 0.15000000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666746139527, + "step": 1248 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.6520965576171, + "epoch": 0.39974395903344534, + "grad_norm": 0.13072729110717773, + "kl": 0.4669448517262936, + "learning_rate": 1.5009675633987027e-05, + "loss": 0.0444, + "reward": 1.063541692495346, + "reward_std": 0.11234742254018784, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9864583432674408, + "step": 1249 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.2000183105469, + "epoch": 0.4000640102416387, + "grad_norm": 0.14546598494052887, + "kl": 0.41185231059789656, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.0769, + "reward": 1.1276041984558105, + "reward_std": 0.0777821946889162, + "rewards/accuracy_reward": 0.1416666718199849, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375059604645, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.3812744140625, + "epoch": 0.40038406144983196, + "grad_norm": 0.1501355767250061, + "kl": 0.47948102802038195, + "learning_rate": 1.4990318120789074e-05, + "loss": 0.0584, + "reward": 1.060416692495346, + "reward_std": 0.06387959867715835, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9875000059604645, + "step": 1251 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.3416870117187, + "epoch": 0.4007041126580253, + "grad_norm": 0.13536398112773895, + "kl": 0.394258227199316, + "learning_rate": 1.4980630008447343e-05, + "loss": 0.0783, + "reward": 1.0604166984558105, + "reward_std": 0.11339464448392392, + "rewards/accuracy_reward": 0.08541666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000238418579, + "step": 1252 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.7666778564453, + "epoch": 0.4010241638662186, + "grad_norm": 0.08538512885570526, + "kl": 0.3164933010935783, + "learning_rate": 1.4970935675075694e-05, + "loss": 0.0867, + "reward": 1.0281250298023223, + "reward_std": 0.12806884478777647, + "rewards/accuracy_reward": 0.04791666734963655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083432674408, + "step": 1253 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.8979339599609, + "epoch": 0.4013442150744119, + "grad_norm": 0.11755944043397903, + "kl": 0.36844282820820806, + "learning_rate": 1.496123513278279e-05, + "loss": 0.0792, + "reward": 1.0468750298023224, + "reward_std": 0.11137478947639465, + "rewards/accuracy_reward": 0.0645833346992731, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916805744171, + "step": 1254 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.1604370117187, + "epoch": 0.4016642662826052, + "grad_norm": 0.10501649975776672, + "kl": 0.3535917893052101, + "learning_rate": 1.4951528393685033e-05, + "loss": 0.078, + "reward": 1.0364583611488343, + "reward_std": 0.11245781332254409, + "rewards/accuracy_reward": 0.05208333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750178813935, + "step": 1255 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.7729370117188, + "epoch": 0.40198431749079855, + "grad_norm": 0.6395413875579834, + "kl": 0.40054913982748985, + "learning_rate": 1.4941815469906578e-05, + "loss": 0.0792, + "reward": 1.0505208492279052, + "reward_std": 0.0950919346883893, + "rewards/accuracy_reward": 0.06458333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 1256 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.3854400634766, + "epoch": 0.40230436869899183, + "grad_norm": 0.14963221549987793, + "kl": 0.31750036850571633, + "learning_rate": 1.4932096373579304e-05, + "loss": 0.0544, + "reward": 1.0114583492279052, + "reward_std": 0.12480773292481899, + "rewards/accuracy_reward": 0.029166667722165585, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916865348816, + "step": 1257 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.2250122070312, + "epoch": 0.40262441990718517, + "grad_norm": 0.09617770463228226, + "kl": 0.45049638152122495, + "learning_rate": 1.49223711168428e-05, + "loss": 0.125, + "reward": 1.0140625178813933, + "reward_std": 0.128690517693758, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291805744171, + "step": 1258 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.6125183105469, + "epoch": 0.40294447111537846, + "grad_norm": 0.2407182902097702, + "kl": 0.42702504619956017, + "learning_rate": 1.4912639711844341e-05, + "loss": 0.0967, + "reward": 1.1062500476837158, + "reward_std": 0.12439418062567711, + "rewards/accuracy_reward": 0.13125000409781934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000238418579, + "step": 1259 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.9437683105468, + "epoch": 0.4032645223235718, + "grad_norm": 0.055207595229148865, + "kl": 0.26149168610572815, + "learning_rate": 1.490290217073889e-05, + "loss": 0.0297, + "reward": 1.1635416924953461, + "reward_std": 0.08959200419485569, + "rewards/accuracy_reward": 0.17708334047347307, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583432674408, + "step": 1260 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.2416839599609, + "epoch": 0.4035845735317651, + "grad_norm": 0.0995592474937439, + "kl": 0.39583138301968573, + "learning_rate": 1.4893158505689071e-05, + "loss": 0.0881, + "reward": 1.0114583492279052, + "reward_std": 0.1348001252859831, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.971875011920929, + "step": 1261 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.0791809082032, + "epoch": 0.4039046247399584, + "grad_norm": 0.07839322835206985, + "kl": 0.2790032118558884, + "learning_rate": 1.4883408728865164e-05, + "loss": 0.0807, + "reward": 1.0427083551883698, + "reward_std": 0.1275832900777459, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1262 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.4375122070312, + "epoch": 0.4042246759481517, + "grad_norm": 0.06481237709522247, + "kl": 0.22605575621128082, + "learning_rate": 1.487365285244507e-05, + "loss": 0.0793, + "reward": 1.0750000298023223, + "reward_std": 0.12191822603344918, + "rewards/accuracy_reward": 0.10208333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166805744172, + "step": 1263 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.8937774658203, + "epoch": 0.40454472715634504, + "grad_norm": 0.058921996504068375, + "kl": 0.2315479911863804, + "learning_rate": 1.4863890888614314e-05, + "loss": 0.0253, + "reward": 1.2218750417232513, + "reward_std": 0.13045966662466527, + "rewards/accuracy_reward": 0.24166667126119137, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083432674408, + "step": 1264 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.0125183105469, + "epoch": 0.4048647783645383, + "grad_norm": 0.18803565204143524, + "kl": 0.6443677566945553, + "learning_rate": 1.4854122849566032e-05, + "loss": 0.1322, + "reward": 0.9989583611488342, + "reward_std": 0.1535223349928856, + "rewards/accuracy_reward": 0.03750000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583492279053, + "step": 1265 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.0375213623047, + "epoch": 0.40518482957273166, + "grad_norm": 0.09277193248271942, + "kl": 0.31689817234873774, + "learning_rate": 1.484434874750094e-05, + "loss": 0.0662, + "reward": 1.0484375178813934, + "reward_std": 0.14467626363039016, + "rewards/accuracy_reward": 0.06875000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9796875059604645, + "step": 1266 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.3333526611328, + "epoch": 0.40550488078092495, + "grad_norm": 0.11833394318819046, + "kl": 0.29800432324409487, + "learning_rate": 1.483456859462733e-05, + "loss": 0.0553, + "reward": 1.079166692495346, + "reward_std": 0.12277526557445526, + "rewards/accuracy_reward": 0.10000000353902579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1267 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.3791809082031, + "epoch": 0.40582493198911823, + "grad_norm": 0.0788242369890213, + "kl": 0.3021985150873661, + "learning_rate": 1.4824782403161049e-05, + "loss": 0.0636, + "reward": 1.0885416984558105, + "reward_std": 0.11099445223808288, + "rewards/accuracy_reward": 0.11041666995733976, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.978125, + "step": 1268 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.2083557128906, + "epoch": 0.40614498319731157, + "grad_norm": 0.15254127979278564, + "kl": 0.24624716192483903, + "learning_rate": 1.4814990185325488e-05, + "loss": 0.0263, + "reward": 1.0864583611488343, + "reward_std": 0.10864646323025226, + "rewards/accuracy_reward": 0.1000000024214387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583432674408, + "step": 1269 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.1250183105469, + "epoch": 0.40646503440550485, + "grad_norm": 0.08833687007427216, + "kl": 0.3731885127723217, + "learning_rate": 1.480519195335157e-05, + "loss": 0.122, + "reward": 1.0072916924953461, + "reward_std": 0.0928686197847128, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583611488343, + "step": 1270 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.7500244140625, + "epoch": 0.4067850856136982, + "grad_norm": 0.1265832781791687, + "kl": 0.41062879338860514, + "learning_rate": 1.4795387719477719e-05, + "loss": 0.0834, + "reward": 1.1348958671092988, + "reward_std": 0.1514855232089758, + "rewards/accuracy_reward": 0.16666666995733975, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291924953461, + "step": 1271 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.0958465576172, + "epoch": 0.4071051368218915, + "grad_norm": 0.10724162310361862, + "kl": 0.4103553980588913, + "learning_rate": 1.4785577495949866e-05, + "loss": 0.0846, + "reward": 1.045833373069763, + "reward_std": 0.15014931112527846, + "rewards/accuracy_reward": 0.07083333600312472, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1272 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.7375183105469, + "epoch": 0.4074251880300848, + "grad_norm": 0.11321690678596497, + "kl": 0.6172791600227356, + "learning_rate": 1.4775761295021418e-05, + "loss": 0.1314, + "reward": 1.1250000238418578, + "reward_std": 0.1790860690176487, + "rewards/accuracy_reward": 0.15416666883975266, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333492279053, + "step": 1273 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.4604370117188, + "epoch": 0.4077452392382781, + "grad_norm": 0.0912218764424324, + "kl": 0.37791914120316505, + "learning_rate": 1.4765939128953255e-05, + "loss": 0.0626, + "reward": 1.0703125238418578, + "reward_std": 0.16757386401295662, + "rewards/accuracy_reward": 0.09375000353902578, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625059604645, + "step": 1274 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.5937744140625, + "epoch": 0.40806529044647144, + "grad_norm": 0.1103522852063179, + "kl": 0.23164157569408417, + "learning_rate": 1.4756111010013694e-05, + "loss": 0.046, + "reward": 1.0833333611488343, + "reward_std": 0.1385488674044609, + "rewards/accuracy_reward": 0.10000000521540642, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.981250011920929, + "step": 1275 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.1708526611328, + "epoch": 0.4083853416546647, + "grad_norm": 0.1879657357931137, + "kl": 0.4369122177362442, + "learning_rate": 1.47462769504785e-05, + "loss": 0.1146, + "reward": 1.0880208492279053, + "reward_std": 0.11164197325706482, + "rewards/accuracy_reward": 0.11041666995733976, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 1276 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.6062622070312, + "epoch": 0.40870539286285806, + "grad_norm": 0.22609193623065948, + "kl": 0.8324251100420952, + "learning_rate": 1.473643696263085e-05, + "loss": 0.1212, + "reward": 0.9630208373069763, + "reward_std": 0.1323666602373123, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375059604645, + "step": 1277 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.5500305175781, + "epoch": 0.40902544407105135, + "grad_norm": 0.14130529761314392, + "kl": 0.5213746406137943, + "learning_rate": 1.4726591058761336e-05, + "loss": 0.0789, + "reward": 1.1119791984558105, + "reward_std": 0.1154967736452818, + "rewards/accuracy_reward": 0.13750000409781932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791865348816, + "step": 1278 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.8812622070312, + "epoch": 0.4093454952792447, + "grad_norm": 0.12913070619106293, + "kl": 0.6176031097769737, + "learning_rate": 1.4716739251167931e-05, + "loss": 0.1167, + "reward": 0.9677083492279053, + "reward_std": 0.12283567264676094, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.965625011920929, + "step": 1279 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.9333618164062, + "epoch": 0.40966554648743797, + "grad_norm": 0.22252824902534485, + "kl": 0.608339787274599, + "learning_rate": 1.470688155215598e-05, + "loss": 0.1006, + "reward": 1.0734375417232513, + "reward_std": 0.18009853959083558, + "rewards/accuracy_reward": 0.10208333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 1280 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.970849609375, + "epoch": 0.4099855976956313, + "grad_norm": 0.2188696563243866, + "kl": 0.6703398540616036, + "learning_rate": 1.4697017974038192e-05, + "loss": 0.1193, + "reward": 0.9994791984558106, + "reward_std": 0.12184309475123882, + "rewards/accuracy_reward": 0.037500002048909666, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791865348816, + "step": 1281 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.2937683105469, + "epoch": 0.4103056489038246, + "grad_norm": 0.46435990929603577, + "kl": 0.5058012694120407, + "learning_rate": 1.4687148529134621e-05, + "loss": 0.1268, + "reward": 0.9937500178813934, + "reward_std": 0.16947735473513603, + "rewards/accuracy_reward": 0.035416668839752675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333492279053, + "step": 1282 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.6354370117188, + "epoch": 0.41062570011201793, + "grad_norm": 0.08456681668758392, + "kl": 0.2596034061163664, + "learning_rate": 1.467727322977264e-05, + "loss": 0.0533, + "reward": 1.015625011920929, + "reward_std": 0.08625244870781898, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250059604645, + "step": 1283 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.0146057128907, + "epoch": 0.4109457513202112, + "grad_norm": 0.14722603559494019, + "kl": 0.26935129314661027, + "learning_rate": 1.4667392088286946e-05, + "loss": 0.0653, + "reward": 1.1036458671092988, + "reward_std": 0.12447739690542221, + "rewards/accuracy_reward": 0.12500000409781933, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458432674408, + "step": 1284 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.1020935058593, + "epoch": 0.41126580252840456, + "grad_norm": 0.1227230504155159, + "kl": 0.3881824046373367, + "learning_rate": 1.4657505117019523e-05, + "loss": 0.1171, + "reward": 1.140625035762787, + "reward_std": 0.1513745330274105, + "rewards/accuracy_reward": 0.16875000558793546, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.971875011920929, + "step": 1285 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.5791900634765, + "epoch": 0.41158585373659784, + "grad_norm": 0.04737142100930214, + "kl": 0.231598449498415, + "learning_rate": 1.4647612328319645e-05, + "loss": 0.0141, + "reward": 1.1666666984558105, + "reward_std": 0.09083670191466808, + "rewards/accuracy_reward": 0.17916667461395264, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9854166805744171, + "step": 1286 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.2229370117187, + "epoch": 0.4119059049447912, + "grad_norm": 0.10313890129327774, + "kl": 0.2947948418557644, + "learning_rate": 1.4637713734543844e-05, + "loss": 0.0591, + "reward": 1.092187523841858, + "reward_std": 0.07089405842125415, + "rewards/accuracy_reward": 0.10833333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541805744171, + "step": 1287 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.3896026611328, + "epoch": 0.41222595615298446, + "grad_norm": 0.12789444625377655, + "kl": 0.3417969450354576, + "learning_rate": 1.4627809348055908e-05, + "loss": 0.0693, + "reward": 1.0427083730697633, + "reward_std": 0.12029938250780106, + "rewards/accuracy_reward": 0.06875000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 1288 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.1062683105469, + "epoch": 0.4125460073611778, + "grad_norm": 0.06556902825832367, + "kl": 0.1632651649415493, + "learning_rate": 1.461789918122686e-05, + "loss": 0.0484, + "reward": 1.0713542103767395, + "reward_std": 0.10263727717101574, + "rewards/accuracy_reward": 0.08541666902601719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 1289 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.1187744140625, + "epoch": 0.4128660585693711, + "grad_norm": 0.03857985883951187, + "kl": 0.14200967624783517, + "learning_rate": 1.460798324643494e-05, + "loss": 0.0268, + "reward": 1.1416666984558106, + "reward_std": 0.09959968477487564, + "rewards/accuracy_reward": 0.15000000577419997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9916666746139526, + "step": 1290 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.018765258789, + "epoch": 0.4131861097775644, + "grad_norm": 0.13862329721450806, + "kl": 0.49638293087482455, + "learning_rate": 1.4598061556065598e-05, + "loss": 0.0776, + "reward": 1.066666692495346, + "reward_std": 0.13814237490296363, + "rewards/accuracy_reward": 0.09166666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000178813935, + "step": 1291 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.9104431152343, + "epoch": 0.4135061609857577, + "grad_norm": 0.1622048169374466, + "kl": 0.22472710385918618, + "learning_rate": 1.4588134122511467e-05, + "loss": 0.0493, + "reward": 1.059375035762787, + "reward_std": 0.11591703221201896, + "rewards/accuracy_reward": 0.0750000024214387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1292 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.279183959961, + "epoch": 0.41382621219395105, + "grad_norm": 0.0760549008846283, + "kl": 0.20149843543767929, + "learning_rate": 1.457820095817236e-05, + "loss": 0.0466, + "reward": 1.0385416865348815, + "reward_std": 0.09624491930007935, + "rewards/accuracy_reward": 0.0541666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750059604645, + "step": 1293 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.4354278564454, + "epoch": 0.41414626340214433, + "grad_norm": 0.07412170618772507, + "kl": 0.1810336247086525, + "learning_rate": 1.4568262075455237e-05, + "loss": 0.0198, + "reward": 1.1057291984558106, + "reward_std": 0.07921138815581799, + "rewards/accuracy_reward": 0.11458333730697631, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9911458432674408, + "step": 1294 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.7146057128906, + "epoch": 0.41446631461033767, + "grad_norm": 0.11219371855258942, + "kl": 0.3419033609330654, + "learning_rate": 1.4558317486774216e-05, + "loss": 0.0647, + "reward": 1.0661458730697633, + "reward_std": 0.09991935733705759, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9869791865348816, + "step": 1295 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.9521026611328, + "epoch": 0.41478636581853096, + "grad_norm": 0.10580622404813766, + "kl": 0.1414179392158985, + "learning_rate": 1.4548367204550526e-05, + "loss": 0.0409, + "reward": 1.0713541865348817, + "reward_std": 0.09464184809476137, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9921875119209289, + "step": 1296 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.620849609375, + "epoch": 0.4151064170267243, + "grad_norm": 0.06379967927932739, + "kl": 0.1508323907852173, + "learning_rate": 1.4538411241212518e-05, + "loss": 0.0404, + "reward": 1.0572916924953462, + "reward_std": 0.1208875872194767, + "rewards/accuracy_reward": 0.06666666977107524, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.990625011920929, + "step": 1297 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.933349609375, + "epoch": 0.4154264682349176, + "grad_norm": 0.18415191769599915, + "kl": 0.5200513236224651, + "learning_rate": 1.4528449609195639e-05, + "loss": 0.0552, + "reward": 1.0677083611488343, + "reward_std": 0.12492301575839519, + "rewards/accuracy_reward": 0.09166667014360427, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1298 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.5270965576171, + "epoch": 0.4157465194431109, + "grad_norm": 0.09631240367889404, + "kl": 0.23828165605664253, + "learning_rate": 1.4518482320942409e-05, + "loss": 0.0598, + "reward": 1.021875023841858, + "reward_std": 0.11106022223830223, + "rewards/accuracy_reward": 0.03750000093132257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750059604645, + "step": 1299 + }, + { + "clip_ratio": 0.0, + "completion_length": 622.9687622070312, + "epoch": 0.4160665706513042, + "grad_norm": 0.05873395502567291, + "kl": 0.22104017101228238, + "learning_rate": 1.4508509388902421e-05, + "loss": 0.0281, + "reward": 1.1083333551883698, + "reward_std": 0.11215355768799781, + "rewards/accuracy_reward": 0.12708333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9812500178813934, + "step": 1300 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.1458557128906, + "epoch": 0.41638662185949754, + "grad_norm": 0.07611161470413208, + "kl": 0.42822214812040327, + "learning_rate": 1.4498530825532309e-05, + "loss": 0.0433, + "reward": 1.0796875357627869, + "reward_std": 0.11318734250962734, + "rewards/accuracy_reward": 0.10000000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.979687511920929, + "step": 1301 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.6083435058594, + "epoch": 0.4167066730676908, + "grad_norm": 0.08425391465425491, + "kl": 0.2452217899262905, + "learning_rate": 1.448854664329575e-05, + "loss": 0.056, + "reward": 1.0208333551883697, + "reward_std": 0.12165859192609788, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333492279053, + "step": 1302 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.1333557128906, + "epoch": 0.41702672427588416, + "grad_norm": 0.07507262378931046, + "kl": 0.16818516626954078, + "learning_rate": 1.4478556854663435e-05, + "loss": 0.0294, + "reward": 1.0869791865348817, + "reward_std": 0.1346562247723341, + "rewards/accuracy_reward": 0.09791667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9890625059604645, + "step": 1303 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.5583557128906, + "epoch": 0.41734677548407745, + "grad_norm": 0.06492973864078522, + "kl": 0.2809316359460354, + "learning_rate": 1.4468561472113053e-05, + "loss": 0.0517, + "reward": 0.9942708492279053, + "reward_std": 0.08789652790874243, + "rewards/accuracy_reward": 0.01250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708432674408, + "step": 1304 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.5625152587891, + "epoch": 0.4176668266922708, + "grad_norm": 0.2417496144771576, + "kl": 0.35190742164850236, + "learning_rate": 1.4458560508129286e-05, + "loss": 0.0777, + "reward": 1.008854192495346, + "reward_std": 0.1146691657602787, + "rewards/accuracy_reward": 0.029166667349636556, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9796875178813934, + "step": 1305 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.8854339599609, + "epoch": 0.41798687790046407, + "grad_norm": 0.07067929953336716, + "kl": 0.43219382539391515, + "learning_rate": 1.444855397520379e-05, + "loss": 0.0774, + "reward": 1.0885416984558105, + "reward_std": 0.19529549181461334, + "rewards/accuracy_reward": 0.1166666692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750178813934, + "step": 1306 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.2937622070312, + "epoch": 0.4183069291086574, + "grad_norm": 0.17449365556240082, + "kl": 0.4062021173536777, + "learning_rate": 1.4438541885835167e-05, + "loss": 0.0993, + "reward": 1.0052083671092986, + "reward_std": 0.1413590393960476, + "rewards/accuracy_reward": 0.033333334140479565, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.971875011920929, + "step": 1307 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.5812622070313, + "epoch": 0.4186269803168507, + "grad_norm": 0.11451167613267899, + "kl": 0.30510389655828474, + "learning_rate": 1.4428524252528968e-05, + "loss": 0.0636, + "reward": 1.0005208551883698, + "reward_std": 0.07861845903098583, + "rewards/accuracy_reward": 0.016666667722165584, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541805744171, + "step": 1308 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.8896026611328, + "epoch": 0.41894703152504403, + "grad_norm": 0.12974369525909424, + "kl": 0.3899438038468361, + "learning_rate": 1.4418501087797667e-05, + "loss": 0.0522, + "reward": 1.029166692495346, + "reward_std": 0.0809813478961587, + "rewards/accuracy_reward": 0.04583333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333492279053, + "step": 1309 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.8771057128906, + "epoch": 0.4192670827332373, + "grad_norm": 0.370892196893692, + "kl": 0.6947083935141564, + "learning_rate": 1.440847240416064e-05, + "loss": 0.0718, + "reward": 1.0385416984558105, + "reward_std": 0.14330552741885186, + "rewards/accuracy_reward": 0.06041666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250178813934, + "step": 1310 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.027099609375, + "epoch": 0.4195871339414306, + "grad_norm": 0.20165027678012848, + "kl": 0.5955793671309948, + "learning_rate": 1.4398438214144168e-05, + "loss": 0.1082, + "reward": 1.053645873069763, + "reward_std": 0.1770230144262314, + "rewards/accuracy_reward": 0.08125000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958551883698, + "step": 1311 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.0750183105469, + "epoch": 0.41990718514962394, + "grad_norm": 0.31423094868659973, + "kl": 0.6826820693910122, + "learning_rate": 1.4388398530281403e-05, + "loss": 0.0712, + "reward": 1.0364583551883697, + "reward_std": 0.12629178818315268, + "rewards/accuracy_reward": 0.05625000223517418, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 1312 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.3646057128906, + "epoch": 0.4202272363578172, + "grad_norm": 0.30876481533050537, + "kl": 0.5968435898423194, + "learning_rate": 1.4378353365112353e-05, + "loss": 0.1111, + "reward": 1.0307291805744172, + "reward_std": 0.1586002826690674, + "rewards/accuracy_reward": 0.0645833345130086, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 1313 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.1750244140625, + "epoch": 0.42054728756601056, + "grad_norm": 0.1654668152332306, + "kl": 0.26816257983446123, + "learning_rate": 1.436830273118389e-05, + "loss": 0.0452, + "reward": 1.0468750298023224, + "reward_std": 0.11725292392075062, + "rewards/accuracy_reward": 0.06666666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083551883698, + "step": 1314 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.8916839599609, + "epoch": 0.42086733877420385, + "grad_norm": 0.12751136720180511, + "kl": 0.33656053617596626, + "learning_rate": 1.4358246641049696e-05, + "loss": 0.0896, + "reward": 1.046354204416275, + "reward_std": 0.14575700759887694, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375238418579, + "step": 1315 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.2708618164063, + "epoch": 0.4211873899823972, + "grad_norm": 0.20244768261909485, + "kl": 0.6099851727485657, + "learning_rate": 1.4348185107270282e-05, + "loss": 0.0776, + "reward": 1.0927083671092988, + "reward_std": 0.16826162450015544, + "rewards/accuracy_reward": 0.11875000316649675, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 1316 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.8937744140625, + "epoch": 0.42150744119059047, + "grad_norm": 0.13575513660907745, + "kl": 0.4396487962454557, + "learning_rate": 1.4338118142412956e-05, + "loss": 0.0572, + "reward": 1.1109375298023223, + "reward_std": 0.15248046070337296, + "rewards/accuracy_reward": 0.13750000409781932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.973437511920929, + "step": 1317 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.4583557128906, + "epoch": 0.4218274923987838, + "grad_norm": 0.07689894735813141, + "kl": 0.4052378758788109, + "learning_rate": 1.4328045759051805e-05, + "loss": 0.0527, + "reward": 1.017187523841858, + "reward_std": 0.1121408674865961, + "rewards/accuracy_reward": 0.0458333345130086, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 1318 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.8083557128906, + "epoch": 0.4221475436069771, + "grad_norm": 0.12368390709161758, + "kl": 0.47785288393497466, + "learning_rate": 1.4317967969767688e-05, + "loss": 0.0957, + "reward": 1.0130208492279054, + "reward_std": 0.16585390493273736, + "rewards/accuracy_reward": 0.054166669771075246, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9588541746139526, + "step": 1319 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.3416870117187, + "epoch": 0.42246759481517043, + "grad_norm": 0.20103225111961365, + "kl": 0.5053670577704906, + "learning_rate": 1.4307884787148216e-05, + "loss": 0.0999, + "reward": 1.0145833671092988, + "reward_std": 0.2102899357676506, + "rewards/accuracy_reward": 0.06458333618938923, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9500000238418579, + "step": 1320 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.6166809082031, + "epoch": 0.4227876460233637, + "grad_norm": 0.12856552004814148, + "kl": 0.45006143152713773, + "learning_rate": 1.4297796223787734e-05, + "loss": 0.0796, + "reward": 1.0359375298023223, + "reward_std": 0.16981710996478797, + "rewards/accuracy_reward": 0.07500000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375178813935, + "step": 1321 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.7375213623047, + "epoch": 0.42310769723155706, + "grad_norm": 0.26487281918525696, + "kl": 0.5033496886491775, + "learning_rate": 1.4287702292287308e-05, + "loss": 0.115, + "reward": 1.142187523841858, + "reward_std": 0.18477849662303925, + "rewards/accuracy_reward": 0.18333333563059567, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9588541805744171, + "step": 1322 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.2937713623047, + "epoch": 0.42342774843975034, + "grad_norm": 0.08229225873947144, + "kl": 0.34848415181040765, + "learning_rate": 1.4277603005254715e-05, + "loss": 0.0462, + "reward": 1.048437523841858, + "reward_std": 0.11120131872594356, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708492279053, + "step": 1323 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.0687713623047, + "epoch": 0.4237477996479437, + "grad_norm": 0.2630603015422821, + "kl": 0.40130707398056986, + "learning_rate": 1.4267498375304417e-05, + "loss": 0.0867, + "reward": 1.148437535762787, + "reward_std": 0.1714845359325409, + "rewards/accuracy_reward": 0.16875000353902578, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.979687511920929, + "step": 1324 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.0854309082031, + "epoch": 0.42406785085613696, + "grad_norm": 0.2801266610622406, + "kl": 0.7189634054899215, + "learning_rate": 1.425738841505754e-05, + "loss": 0.1297, + "reward": 1.0114583671092987, + "reward_std": 0.16967907920479774, + "rewards/accuracy_reward": 0.05208333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9593750238418579, + "step": 1325 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.1020965576172, + "epoch": 0.4243879020643303, + "grad_norm": 0.5371996164321899, + "kl": 0.9050520665943622, + "learning_rate": 1.4247273137141888e-05, + "loss": 0.15, + "reward": 1.0114583492279052, + "reward_std": 0.16600136533379556, + "rewards/accuracy_reward": 0.05833333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9531250119209289, + "step": 1326 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.4521026611328, + "epoch": 0.4247079532725236, + "grad_norm": 0.36296284198760986, + "kl": 0.7386391490697861, + "learning_rate": 1.4237152554191889e-05, + "loss": 0.0947, + "reward": 0.9786458432674408, + "reward_std": 0.16325723454356195, + "rewards/accuracy_reward": 0.02291666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291746139527, + "step": 1327 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.6958557128906, + "epoch": 0.4250280044807169, + "grad_norm": 0.19977205991744995, + "kl": 0.4357809633016586, + "learning_rate": 1.422702667884861e-05, + "loss": 0.1095, + "reward": 1.0614583492279053, + "reward_std": 0.1346949001774192, + "rewards/accuracy_reward": 0.08541666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1328 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.3979431152344, + "epoch": 0.4253480556889102, + "grad_norm": 0.19443321228027344, + "kl": 0.5088992670178414, + "learning_rate": 1.421689552375972e-05, + "loss": 0.114, + "reward": 1.0536458551883698, + "reward_std": 0.16536558866500856, + "rewards/accuracy_reward": 0.09166666995733977, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791865348816, + "step": 1329 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.1500244140625, + "epoch": 0.42566810689710355, + "grad_norm": 0.10612791031599045, + "kl": 0.22260906249284745, + "learning_rate": 1.4206759101579481e-05, + "loss": 0.0702, + "reward": 1.0682291746139527, + "reward_std": 0.12378347031772137, + "rewards/accuracy_reward": 0.08750000335276127, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1330 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.0666778564453, + "epoch": 0.42598815810529683, + "grad_norm": 0.30102500319480896, + "kl": 0.4394591063261032, + "learning_rate": 1.4196617424968744e-05, + "loss": 0.0728, + "reward": 1.051562535762787, + "reward_std": 0.13498535864055156, + "rewards/accuracy_reward": 0.07500000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625119209289, + "step": 1331 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.0229370117188, + "epoch": 0.42630820931349017, + "grad_norm": 0.17076443135738373, + "kl": 0.2940096914768219, + "learning_rate": 1.4186470506594919e-05, + "loss": 0.0614, + "reward": 1.027604204416275, + "reward_std": 0.12638225294649602, + "rewards/accuracy_reward": 0.05208333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208492279053, + "step": 1332 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.5958435058594, + "epoch": 0.42662826052168346, + "grad_norm": 0.2015293687582016, + "kl": 0.1857258200645447, + "learning_rate": 1.4176318359131955e-05, + "loss": 0.0245, + "reward": 1.0583333492279052, + "reward_std": 0.09938515722751617, + "rewards/accuracy_reward": 0.07291666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166746139527, + "step": 1333 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.6291870117187, + "epoch": 0.4269483117298768, + "grad_norm": 0.14335834980010986, + "kl": 0.2323061317205429, + "learning_rate": 1.4166160995260342e-05, + "loss": 0.0585, + "reward": 1.136979192495346, + "reward_std": 0.13655745461583138, + "rewards/accuracy_reward": 0.16041666697710752, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625059604645, + "step": 1334 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.8916839599609, + "epoch": 0.4272683629380701, + "grad_norm": 0.12033113837242126, + "kl": 0.2362310327589512, + "learning_rate": 1.4155998427667083e-05, + "loss": 0.0372, + "reward": 1.0317708492279052, + "reward_std": 0.11146253608167171, + "rewards/accuracy_reward": 0.04791666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541805744171, + "step": 1335 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.7312744140625, + "epoch": 0.4275884141462634, + "grad_norm": 0.1028214618563652, + "kl": 0.3947672449052334, + "learning_rate": 1.414583066904568e-05, + "loss": 0.0747, + "reward": 1.0239583611488343, + "reward_std": 0.08998498003929853, + "rewards/accuracy_reward": 0.04375000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083551883698, + "step": 1336 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.7854431152343, + "epoch": 0.4279084653544567, + "grad_norm": 0.11494532227516174, + "kl": 0.3638791225850582, + "learning_rate": 1.4135657732096118e-05, + "loss": 0.1045, + "reward": 1.0416666984558105, + "reward_std": 0.12086176574230194, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500119209289, + "step": 1337 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.3312683105469, + "epoch": 0.42822851656265004, + "grad_norm": 0.09641270339488983, + "kl": 0.22046290934085847, + "learning_rate": 1.4125479629524849e-05, + "loss": 0.0818, + "reward": 1.0807291984558105, + "reward_std": 0.1261975012719631, + "rewards/accuracy_reward": 0.10625000316649676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791865348816, + "step": 1338 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.4583526611328, + "epoch": 0.4285485677708433, + "grad_norm": 0.07313210517168045, + "kl": 0.35226295664906504, + "learning_rate": 1.411529637404478e-05, + "loss": 0.0564, + "reward": 1.121354204416275, + "reward_std": 0.14535944275557994, + "rewards/accuracy_reward": 0.14791666958481073, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375178813934, + "step": 1339 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.185433959961, + "epoch": 0.42886861897903666, + "grad_norm": 0.09117227047681808, + "kl": 0.3916263036429882, + "learning_rate": 1.4105107978375256e-05, + "loss": 0.093, + "reward": 0.986979192495346, + "reward_std": 0.10924595408141613, + "rewards/accuracy_reward": 0.010416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625178813935, + "step": 1340 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.2000244140625, + "epoch": 0.42918867018722995, + "grad_norm": 0.17177411913871765, + "kl": 0.3724663570523262, + "learning_rate": 1.409491445524204e-05, + "loss": 0.0719, + "reward": 1.139062523841858, + "reward_std": 0.15040809139609337, + "rewards/accuracy_reward": 0.15833333749324083, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291746139526, + "step": 1341 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.383349609375, + "epoch": 0.4295087213954233, + "grad_norm": 0.248824343085289, + "kl": 0.46576319485902784, + "learning_rate": 1.4084715817377292e-05, + "loss": 0.0807, + "reward": 1.0031250238418579, + "reward_std": 0.1396949838846922, + "rewards/accuracy_reward": 0.02916666753590107, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 1342 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.879183959961, + "epoch": 0.42982877260361657, + "grad_norm": 0.15470728278160095, + "kl": 0.295084173977375, + "learning_rate": 1.4074512077519571e-05, + "loss": 0.0712, + "reward": 1.148437535762787, + "reward_std": 0.11446435116231442, + "rewards/accuracy_reward": 0.170833339355886, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 1343 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.7166778564454, + "epoch": 0.4301488238118099, + "grad_norm": 0.11134085059165955, + "kl": 0.4050963319838047, + "learning_rate": 1.4064303248413808e-05, + "loss": 0.0674, + "reward": 1.0354166865348815, + "reward_std": 0.1173494003713131, + "rewards/accuracy_reward": 0.05208333618938923, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333373069764, + "step": 1344 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.3625183105469, + "epoch": 0.4304688750200032, + "grad_norm": 0.1598719358444214, + "kl": 0.3194881580770016, + "learning_rate": 1.4054089342811286e-05, + "loss": 0.0722, + "reward": 1.0427083492279052, + "reward_std": 0.11805956549942494, + "rewards/accuracy_reward": 0.0645833346992731, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250178813934, + "step": 1345 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.6125091552734, + "epoch": 0.43078892622819653, + "grad_norm": 0.06611867994070053, + "kl": 0.24079276397824287, + "learning_rate": 1.4043870373469628e-05, + "loss": 0.0698, + "reward": 1.0916666865348816, + "reward_std": 0.12190479338169098, + "rewards/accuracy_reward": 0.1083333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333492279053, + "step": 1346 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.9312622070313, + "epoch": 0.4311089774363898, + "grad_norm": 0.16819950938224792, + "kl": 0.30558022633194926, + "learning_rate": 1.4033646353152786e-05, + "loss": 0.05, + "reward": 1.0427083551883698, + "reward_std": 0.14329716470092535, + "rewards/accuracy_reward": 0.0604166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916746139526, + "step": 1347 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.6791839599609, + "epoch": 0.43142902864458316, + "grad_norm": 0.06702463328838348, + "kl": 0.24792197570204735, + "learning_rate": 1.4023417294631019e-05, + "loss": 0.0523, + "reward": 1.0468750298023224, + "reward_std": 0.0981780519708991, + "rewards/accuracy_reward": 0.06250000316649676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1348 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.0000152587891, + "epoch": 0.43174907985277644, + "grad_norm": 0.1554039567708969, + "kl": 0.26410412788391113, + "learning_rate": 1.401318321068088e-05, + "loss": 0.0761, + "reward": 1.0552083432674408, + "reward_std": 0.13532831519842148, + "rewards/accuracy_reward": 0.0791666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416746139526, + "step": 1349 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.9354309082031, + "epoch": 0.4320691310609698, + "grad_norm": 0.07704272121191025, + "kl": 0.310164712369442, + "learning_rate": 1.40029441140852e-05, + "loss": 0.034, + "reward": 1.0609375298023225, + "reward_std": 0.13052574992179872, + "rewards/accuracy_reward": 0.08125000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.979687511920929, + "step": 1350 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.5687683105468, + "epoch": 0.43238918226916306, + "grad_norm": 0.07124790549278259, + "kl": 0.33384485468268393, + "learning_rate": 1.3992700017633063e-05, + "loss": 0.0371, + "reward": 1.0890625357627868, + "reward_std": 0.15975245274603367, + "rewards/accuracy_reward": 0.10833333600312471, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291865348816, + "step": 1351 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.70419921875, + "epoch": 0.4327092334773564, + "grad_norm": 0.05800170451402664, + "kl": 0.24518816471099852, + "learning_rate": 1.3982450934119808e-05, + "loss": 0.0494, + "reward": 1.035937535762787, + "reward_std": 0.12522053439170122, + "rewards/accuracy_reward": 0.052083334513008596, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9817708492279053, + "step": 1352 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.5833526611328, + "epoch": 0.4330292846855497, + "grad_norm": 0.23074017465114594, + "kl": 0.4324066393077374, + "learning_rate": 1.3972196876347005e-05, + "loss": 0.0881, + "reward": 1.1557292103767396, + "reward_std": 0.10613631978631019, + "rewards/accuracy_reward": 0.17291667219251394, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.982812511920929, + "step": 1353 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.9500213623047, + "epoch": 0.433349335893743, + "grad_norm": 0.20919708907604218, + "kl": 0.3582280553877354, + "learning_rate": 1.3961937857122418e-05, + "loss": 0.0597, + "reward": 1.097916692495346, + "reward_std": 0.10435024127364159, + "rewards/accuracy_reward": 0.12291667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000059604644, + "step": 1354 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.3750213623047, + "epoch": 0.4336693871019363, + "grad_norm": 0.16085828840732574, + "kl": 0.47188392728567125, + "learning_rate": 1.3951673889260033e-05, + "loss": 0.0707, + "reward": 1.1635417103767396, + "reward_std": 0.1453533548861742, + "rewards/accuracy_reward": 0.18333334047347308, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083492279052, + "step": 1355 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.4375183105469, + "epoch": 0.4339894383101296, + "grad_norm": 0.09656932204961777, + "kl": 0.3448480650782585, + "learning_rate": 1.394140498558e-05, + "loss": 0.0775, + "reward": 0.9875000178813934, + "reward_std": 0.12802811972796918, + "rewards/accuracy_reward": 0.01666666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333551883698, + "step": 1356 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.3520965576172, + "epoch": 0.43430948951832293, + "grad_norm": 0.12296207249164581, + "kl": 0.41478071361780167, + "learning_rate": 1.3931131158908644e-05, + "loss": 0.0401, + "reward": 1.014583373069763, + "reward_std": 0.12463017739355564, + "rewards/accuracy_reward": 0.0375000013038516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833611488342, + "step": 1357 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.7896118164062, + "epoch": 0.4346295407265162, + "grad_norm": 0.11532151699066162, + "kl": 0.5874006308615207, + "learning_rate": 1.392085242207843e-05, + "loss": 0.1042, + "reward": 0.9869791865348816, + "reward_std": 0.15448905751109124, + "rewards/accuracy_reward": 0.025000000931322576, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791805744171, + "step": 1358 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.7541900634766, + "epoch": 0.43494959193470956, + "grad_norm": 0.16093531250953674, + "kl": 0.7109584361314774, + "learning_rate": 1.391056878792796e-05, + "loss": 0.1082, + "reward": 1.0348958671092987, + "reward_std": 0.18050943203270436, + "rewards/accuracy_reward": 0.07291666958481073, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791865348816, + "step": 1359 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.0562744140625, + "epoch": 0.43526964314290284, + "grad_norm": 0.13128206133842468, + "kl": 0.5672376766800881, + "learning_rate": 1.3900280269301957e-05, + "loss": 0.0712, + "reward": 1.043750035762787, + "reward_std": 0.17327113449573517, + "rewards/accuracy_reward": 0.07500000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500178813935, + "step": 1360 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.2541809082031, + "epoch": 0.4355896943510962, + "grad_norm": 0.09163526445627213, + "kl": 0.29370440244674684, + "learning_rate": 1.3889986879051242e-05, + "loss": 0.0631, + "reward": 1.1031250357627869, + "reward_std": 0.13333576880395412, + "rewards/accuracy_reward": 0.1250000050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250178813934, + "step": 1361 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.808349609375, + "epoch": 0.43590974555928946, + "grad_norm": 0.14829999208450317, + "kl": 0.40503218322992324, + "learning_rate": 1.3879688630032717e-05, + "loss": 0.0806, + "reward": 1.079166704416275, + "reward_std": 0.1650255832821131, + "rewards/accuracy_reward": 0.10833333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333551883698, + "step": 1362 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.5041900634766, + "epoch": 0.4362297967674828, + "grad_norm": 0.23036116361618042, + "kl": 0.4940149299800396, + "learning_rate": 1.3869385535109358e-05, + "loss": 0.0977, + "reward": 1.0208333492279054, + "reward_std": 0.18670744299888611, + "rewards/accuracy_reward": 0.05208333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500119209289, + "step": 1363 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.0000244140625, + "epoch": 0.4365498479756761, + "grad_norm": 0.07721938192844391, + "kl": 0.32006452083587644, + "learning_rate": 1.385907760715019e-05, + "loss": 0.0435, + "reward": 1.0260416865348816, + "reward_std": 0.10851494073867798, + "rewards/accuracy_reward": 0.05208333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 1364 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.4312744140625, + "epoch": 0.4368698991838694, + "grad_norm": 0.07113399356603622, + "kl": 0.3193306714296341, + "learning_rate": 1.3848764859030281e-05, + "loss": 0.0463, + "reward": 1.076562523841858, + "reward_std": 0.14118604324758052, + "rewards/accuracy_reward": 0.09166666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958432674408, + "step": 1365 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.0958587646485, + "epoch": 0.4371899503920627, + "grad_norm": 0.0633530393242836, + "kl": 0.27230303883552553, + "learning_rate": 1.3838447303630713e-05, + "loss": 0.0554, + "reward": 1.0484375298023223, + "reward_std": 0.12122409045696259, + "rewards/accuracy_reward": 0.06666666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708432674408, + "step": 1366 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.2021026611328, + "epoch": 0.43751000160025605, + "grad_norm": 0.0803065150976181, + "kl": 0.3396205462515354, + "learning_rate": 1.3828124953838574e-05, + "loss": 0.0545, + "reward": 1.0364583671092986, + "reward_std": 0.12746839523315429, + "rewards/accuracy_reward": 0.05833333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250178813934, + "step": 1367 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.239599609375, + "epoch": 0.43783005280844933, + "grad_norm": 0.10097281634807587, + "kl": 0.3694083333015442, + "learning_rate": 1.381779782254694e-05, + "loss": 0.0711, + "reward": 1.115625023841858, + "reward_std": 0.17042535543441772, + "rewards/accuracy_reward": 0.1375000026077032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 1368 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.095849609375, + "epoch": 0.43815010401664267, + "grad_norm": 0.1160140186548233, + "kl": 0.41535960510373116, + "learning_rate": 1.3807465922654863e-05, + "loss": 0.0712, + "reward": 0.9838541805744171, + "reward_std": 0.10553747303783893, + "rewards/accuracy_reward": 0.012500000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 1369 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.1791839599609, + "epoch": 0.43847015522483596, + "grad_norm": 0.1678745597600937, + "kl": 0.25593645721673963, + "learning_rate": 1.3797129267067348e-05, + "loss": 0.0589, + "reward": 1.0921875357627868, + "reward_std": 0.14206044673919677, + "rewards/accuracy_reward": 0.11041667088866233, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708373069763, + "step": 1370 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.6916809082031, + "epoch": 0.4387902064330293, + "grad_norm": 0.0591396726667881, + "kl": 0.24178946688771247, + "learning_rate": 1.378678786869534e-05, + "loss": 0.0485, + "reward": 1.0317708492279052, + "reward_std": 0.1332608327269554, + "rewards/accuracy_reward": 0.054166667722165586, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041746139527, + "step": 1371 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.7541900634766, + "epoch": 0.4391102576412226, + "grad_norm": 0.11310972273349762, + "kl": 0.44280795007944107, + "learning_rate": 1.3776441740455706e-05, + "loss": 0.0808, + "reward": 1.0494792103767394, + "reward_std": 0.162718590721488, + "rewards/accuracy_reward": 0.07916666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125119209289, + "step": 1372 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.5375183105468, + "epoch": 0.4394303088494159, + "grad_norm": 0.7537197470664978, + "kl": 0.926392175257206, + "learning_rate": 1.376609089527123e-05, + "loss": 0.1126, + "reward": 1.079166692495346, + "reward_std": 0.13849810790270567, + "rewards/accuracy_reward": 0.11875000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9604166865348815, + "step": 1373 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.4771026611328, + "epoch": 0.4397503600576092, + "grad_norm": 0.17892588675022125, + "kl": 0.3169956490397453, + "learning_rate": 1.3755735346070576e-05, + "loss": 0.0841, + "reward": 1.1223958790302277, + "reward_std": 0.18202281817793847, + "rewards/accuracy_reward": 0.14791667200624942, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791865348816, + "step": 1374 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.9979370117187, + "epoch": 0.44007041126580254, + "grad_norm": 0.1239943578839302, + "kl": 0.5117902666330337, + "learning_rate": 1.374537510578829e-05, + "loss": 0.0844, + "reward": 1.0890625357627868, + "reward_std": 0.13241406325250865, + "rewards/accuracy_reward": 0.1166666703298688, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958492279052, + "step": 1375 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.6083526611328, + "epoch": 0.4403904624739958, + "grad_norm": 0.12511348724365234, + "kl": 0.4907986491918564, + "learning_rate": 1.3735010187364776e-05, + "loss": 0.0473, + "reward": 0.9937500119209289, + "reward_std": 0.13010377399623393, + "rewards/accuracy_reward": 0.01875000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000178813935, + "step": 1376 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.4604370117188, + "epoch": 0.44071051368218916, + "grad_norm": 0.11388861387968063, + "kl": 0.2962417095899582, + "learning_rate": 1.3724640603746282e-05, + "loss": 0.0731, + "reward": 1.1531250298023223, + "reward_std": 0.1589014722034335, + "rewards/accuracy_reward": 0.1812500048428774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750178813934, + "step": 1377 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.4437683105468, + "epoch": 0.44103056489038245, + "grad_norm": 0.07961485534906387, + "kl": 0.2853081613779068, + "learning_rate": 1.3714266367884883e-05, + "loss": 0.0625, + "reward": 1.008854204416275, + "reward_std": 0.1624489687383175, + "rewards/accuracy_reward": 0.0354166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375178813934, + "step": 1378 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.8771057128906, + "epoch": 0.4413506160985758, + "grad_norm": 0.2938500940799713, + "kl": 0.4174025818705559, + "learning_rate": 1.3703887492738463e-05, + "loss": 0.1021, + "reward": 1.0604166865348816, + "reward_std": 0.12792656924575568, + "rewards/accuracy_reward": 0.09166666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500178813935, + "step": 1379 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.7812713623047, + "epoch": 0.44167066730676907, + "grad_norm": 0.19335335493087769, + "kl": 0.3752726331353188, + "learning_rate": 1.36935039912707e-05, + "loss": 0.0885, + "reward": 1.1031250298023223, + "reward_std": 0.16487400010228156, + "rewards/accuracy_reward": 0.1375000050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.965625011920929, + "step": 1380 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.3437622070312, + "epoch": 0.4419907185149624, + "grad_norm": 0.1562214195728302, + "kl": 0.46785186752676966, + "learning_rate": 1.3683115876451054e-05, + "loss": 0.0611, + "reward": 1.1208333492279052, + "reward_std": 0.16405072771012782, + "rewards/accuracy_reward": 0.15000000428408383, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 1381 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.9271057128906, + "epoch": 0.4423107697231557, + "grad_norm": 0.1530592143535614, + "kl": 0.3877778798341751, + "learning_rate": 1.3672723161254748e-05, + "loss": 0.0817, + "reward": 1.063541692495346, + "reward_std": 0.18836085498332977, + "rewards/accuracy_reward": 0.10000000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1382 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.8270874023438, + "epoch": 0.44263082093134903, + "grad_norm": 0.23629511892795563, + "kl": 0.3606575734913349, + "learning_rate": 1.3662325858662743e-05, + "loss": 0.0675, + "reward": 1.1322916924953461, + "reward_std": 0.10985236279666424, + "rewards/accuracy_reward": 0.1541666707023978, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250178813934, + "step": 1383 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.9500274658203, + "epoch": 0.4429508721395423, + "grad_norm": 0.1339237093925476, + "kl": 0.508060896396637, + "learning_rate": 1.3651923981661741e-05, + "loss": 0.1255, + "reward": 0.9791666984558105, + "reward_std": 0.16428967230021954, + "rewards/accuracy_reward": 0.01458333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833492279052, + "step": 1384 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.7604370117188, + "epoch": 0.44327092334773566, + "grad_norm": 0.07876008003950119, + "kl": 0.30484682992100715, + "learning_rate": 1.3641517543244152e-05, + "loss": 0.0557, + "reward": 1.0671875178813934, + "reward_std": 0.13168583028018474, + "rewards/accuracy_reward": 0.08541666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708611488343, + "step": 1385 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.7666870117188, + "epoch": 0.44359097455592894, + "grad_norm": 0.22591634094715118, + "kl": 0.41740057840943334, + "learning_rate": 1.363110655640808e-05, + "loss": 0.0997, + "reward": 1.0437500298023223, + "reward_std": 0.13870418183505534, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000178813935, + "step": 1386 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.49794921875, + "epoch": 0.4439110257641223, + "grad_norm": 0.34502559900283813, + "kl": 0.41770399175584316, + "learning_rate": 1.3620691034157314e-05, + "loss": 0.0936, + "reward": 1.0588541865348815, + "reward_std": 0.11579778082668782, + "rewards/accuracy_reward": 0.0937500026077032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041805744172, + "step": 1387 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.4083557128906, + "epoch": 0.44423107697231556, + "grad_norm": 0.08325286954641342, + "kl": 0.22816858440637589, + "learning_rate": 1.3610270989501311e-05, + "loss": 0.06, + "reward": 1.1005208671092988, + "reward_std": 0.16243541650474072, + "rewards/accuracy_reward": 0.12500000316649676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208551883697, + "step": 1388 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.3875244140625, + "epoch": 0.4445511281805089, + "grad_norm": 0.10017146915197372, + "kl": 0.32076493874192236, + "learning_rate": 1.3599846435455168e-05, + "loss": 0.1516, + "reward": 1.0677083671092986, + "reward_std": 0.17430904135107994, + "rewards/accuracy_reward": 0.10833333842456341, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9593750178813935, + "step": 1389 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.8875183105469, + "epoch": 0.4448711793887022, + "grad_norm": 0.05737130716443062, + "kl": 0.19035155102610588, + "learning_rate": 1.358941738503963e-05, + "loss": 0.0669, + "reward": 1.051562523841858, + "reward_std": 0.12057933807373047, + "rewards/accuracy_reward": 0.07500000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625178813935, + "step": 1390 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.5666870117187, + "epoch": 0.4451912305968955, + "grad_norm": 0.20350056886672974, + "kl": 0.42712721824645994, + "learning_rate": 1.3578983851281036e-05, + "loss": 0.0805, + "reward": 1.098437523841858, + "reward_std": 0.1661032922565937, + "rewards/accuracy_reward": 0.12500000316649676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375059604645, + "step": 1391 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.7083557128906, + "epoch": 0.4455112818050888, + "grad_norm": 0.1656087338924408, + "kl": 0.30918216332793236, + "learning_rate": 1.3568545847211345e-05, + "loss": 0.0968, + "reward": 1.0078125178813935, + "reward_std": 0.12089228257536888, + "rewards/accuracy_reward": 0.03541666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958551883698, + "step": 1392 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.4666809082031, + "epoch": 0.44583133301328215, + "grad_norm": 0.046774476766586304, + "kl": 0.19798714965581893, + "learning_rate": 1.3558103385868087e-05, + "loss": 0.0465, + "reward": 1.0338541924953462, + "reward_std": 0.0858245899900794, + "rewards/accuracy_reward": 0.052083334885537624, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708492279053, + "step": 1393 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.0250183105469, + "epoch": 0.44615138422147543, + "grad_norm": 0.06257335096597672, + "kl": 0.15758491531014443, + "learning_rate": 1.3547656480294365e-05, + "loss": 0.0458, + "reward": 1.069791704416275, + "reward_std": 0.13865265790373088, + "rewards/accuracy_reward": 0.08750000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916924953461, + "step": 1394 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.7229370117187, + "epoch": 0.4464714354296688, + "grad_norm": 0.21203400194644928, + "kl": 0.16767778843641282, + "learning_rate": 1.3537205143538837e-05, + "loss": 0.0709, + "reward": 1.051562523841858, + "reward_std": 0.11704848129302263, + "rewards/accuracy_reward": 0.07291666995733977, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1395 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.4479339599609, + "epoch": 0.44679148663786206, + "grad_norm": 0.09236126393079758, + "kl": 0.2601070187985897, + "learning_rate": 1.352674938865568e-05, + "loss": 0.0594, + "reward": 1.0505208551883698, + "reward_std": 0.0968201220035553, + "rewards/accuracy_reward": 0.07500000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1396 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.2979400634765, + "epoch": 0.4471115378460554, + "grad_norm": 0.0691533014178276, + "kl": 0.251144764572382, + "learning_rate": 1.351628922870461e-05, + "loss": 0.0553, + "reward": 1.0411458671092988, + "reward_std": 0.12758275177329778, + "rewards/accuracy_reward": 0.05625000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958492279053, + "step": 1397 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.1812683105469, + "epoch": 0.4474315890542487, + "grad_norm": 0.07024051249027252, + "kl": 0.21441357135772704, + "learning_rate": 1.350582467675083e-05, + "loss": 0.053, + "reward": 1.0067708492279053, + "reward_std": 0.09378877226263285, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708432674408, + "step": 1398 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.5125183105469, + "epoch": 0.44775164026244196, + "grad_norm": 0.08377721160650253, + "kl": 0.2407459184527397, + "learning_rate": 1.3495355745865038e-05, + "loss": 0.0422, + "reward": 1.0979166984558106, + "reward_std": 0.16228131018579006, + "rewards/accuracy_reward": 0.1166666705161333, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.981250011920929, + "step": 1399 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.1604248046875, + "epoch": 0.4480716914706353, + "grad_norm": 0.13796372711658478, + "kl": 0.13778001070022583, + "learning_rate": 1.348488244912339e-05, + "loss": 0.0283, + "reward": 1.0557291865348817, + "reward_std": 0.13359115049242973, + "rewards/accuracy_reward": 0.06875000279396773, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9869791805744171, + "step": 1400 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.5208557128906, + "epoch": 0.4483917426788286, + "grad_norm": 0.06486453860998154, + "kl": 0.22841630578041078, + "learning_rate": 1.347440479960751e-05, + "loss": 0.0407, + "reward": 1.1463542044162751, + "reward_std": 0.09497451074421406, + "rewards/accuracy_reward": 0.16041667331010104, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 1401 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.2041931152344, + "epoch": 0.4487117938870219, + "grad_norm": 0.07434765994548798, + "kl": 0.3341860793530941, + "learning_rate": 1.3463922810404448e-05, + "loss": 0.0444, + "reward": 0.9833333551883697, + "reward_std": 0.08629503026604653, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833551883698, + "step": 1402 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.9000061035156, + "epoch": 0.4490318450952152, + "grad_norm": 0.11220329999923706, + "kl": 0.17129188179969787, + "learning_rate": 1.3453436494606683e-05, + "loss": 0.0858, + "reward": 1.0385416865348815, + "reward_std": 0.17846698872745037, + "rewards/accuracy_reward": 0.06666666809469461, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750178813934, + "step": 1403 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.8812683105468, + "epoch": 0.44935189630340855, + "grad_norm": 0.11341900378465652, + "kl": 0.2534954246133566, + "learning_rate": 1.3442945865312085e-05, + "loss": 0.0456, + "reward": 1.0838541984558105, + "reward_std": 0.15260363109409808, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708492279053, + "step": 1404 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.1354400634766, + "epoch": 0.44967194751160183, + "grad_norm": 0.07155326008796692, + "kl": 0.21175614856183528, + "learning_rate": 1.3432450935623922e-05, + "loss": 0.0746, + "reward": 1.0703125298023224, + "reward_std": 0.11777629610151052, + "rewards/accuracy_reward": 0.09583333600312471, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791805744171, + "step": 1405 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.7104370117188, + "epoch": 0.44999199871979517, + "grad_norm": 0.06758978217840195, + "kl": 0.21431030780076982, + "learning_rate": 1.3421951718650836e-05, + "loss": 0.0479, + "reward": 1.0343750178813935, + "reward_std": 0.09610320255160332, + "rewards/accuracy_reward": 0.05000000242143869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1406 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.6521087646485, + "epoch": 0.45031204992798846, + "grad_norm": 0.06923665851354599, + "kl": 0.17125880494713783, + "learning_rate": 1.3411448227506815e-05, + "loss": 0.0414, + "reward": 1.0781250298023224, + "reward_std": 0.08269294798374176, + "rewards/accuracy_reward": 0.09166666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583492279053, + "step": 1407 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.1666839599609, + "epoch": 0.4506321011361818, + "grad_norm": 0.046295374631881714, + "kl": 0.16617081128060818, + "learning_rate": 1.3400940475311193e-05, + "loss": 0.0337, + "reward": 1.1041666984558105, + "reward_std": 0.15676663368940352, + "rewards/accuracy_reward": 0.12083333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333492279053, + "step": 1408 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.8687744140625, + "epoch": 0.4509521523443751, + "grad_norm": 0.12262304872274399, + "kl": 0.3631344482302666, + "learning_rate": 1.3390428475188617e-05, + "loss": 0.022, + "reward": 1.117187535762787, + "reward_std": 0.11911057773977518, + "rewards/accuracy_reward": 0.1333333384245634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541805744171, + "step": 1409 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.472933959961, + "epoch": 0.4512722035525684, + "grad_norm": 0.06850114464759827, + "kl": 0.25028989017009734, + "learning_rate": 1.337991224026905e-05, + "loss": 0.067, + "reward": 1.0416666924953462, + "reward_std": 0.09098817594349384, + "rewards/accuracy_reward": 0.06458333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833551883698, + "step": 1410 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.7666870117188, + "epoch": 0.4515922547607617, + "grad_norm": 0.10766538232564926, + "kl": 0.2578369677066803, + "learning_rate": 1.3369391783687742e-05, + "loss": 0.0676, + "reward": 1.090625035762787, + "reward_std": 0.14667030721902846, + "rewards/accuracy_reward": 0.1062500026077032, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9822916865348816, + "step": 1411 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.9416931152343, + "epoch": 0.45191230596895504, + "grad_norm": 0.07885698229074478, + "kl": 0.33700631856918334, + "learning_rate": 1.3358867118585212e-05, + "loss": 0.0704, + "reward": 1.0671875417232513, + "reward_std": 0.09663599599152803, + "rewards/accuracy_reward": 0.08750000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9796875178813934, + "step": 1412 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.1187683105469, + "epoch": 0.4522323571771483, + "grad_norm": 0.10163812339305878, + "kl": 0.2576158232986927, + "learning_rate": 1.3348338258107235e-05, + "loss": 0.0956, + "reward": 1.1973958730697631, + "reward_std": 0.1255306104198098, + "rewards/accuracy_reward": 0.21875000596046448, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1413 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.9375122070312, + "epoch": 0.45255240838534166, + "grad_norm": 0.09681346267461777, + "kl": 0.4438394993543625, + "learning_rate": 1.3337805215404837e-05, + "loss": 0.101, + "reward": 1.0468750417232513, + "reward_std": 0.1585536990314722, + "rewards/accuracy_reward": 0.08333333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.963541692495346, + "step": 1414 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.3833465576172, + "epoch": 0.45287245959353495, + "grad_norm": 0.09335222840309143, + "kl": 0.3850545734167099, + "learning_rate": 1.3327268003634255e-05, + "loss": 0.1005, + "reward": 1.0765625357627868, + "reward_std": 0.17124846372753383, + "rewards/accuracy_reward": 0.0958333371207118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291805744172, + "step": 1415 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.4562805175781, + "epoch": 0.4531925108017283, + "grad_norm": 0.14342984557151794, + "kl": 0.3690617233514786, + "learning_rate": 1.3316726635956938e-05, + "loss": 0.0825, + "reward": 1.0729166984558105, + "reward_std": 0.09912073966115713, + "rewards/accuracy_reward": 0.09583333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833551883698, + "step": 1416 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.1208557128906, + "epoch": 0.45351256200992157, + "grad_norm": 0.09222087264060974, + "kl": 0.13021155372262, + "learning_rate": 1.3306181125539528e-05, + "loss": 0.0387, + "reward": 1.0973958551883698, + "reward_std": 0.06767892204225064, + "rewards/accuracy_reward": 0.10833333749324084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9890625059604645, + "step": 1417 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.4729370117187, + "epoch": 0.4538326132181149, + "grad_norm": 0.1089828833937645, + "kl": 0.32629442512989043, + "learning_rate": 1.3295631485553838e-05, + "loss": 0.102, + "reward": 1.005729192495346, + "reward_std": 0.12626549191772937, + "rewards/accuracy_reward": 0.03541666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125119209289, + "step": 1418 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.3458557128906, + "epoch": 0.4541526644263082, + "grad_norm": 0.09334749728441238, + "kl": 0.2463594913482666, + "learning_rate": 1.3285077729176844e-05, + "loss": 0.0788, + "reward": 1.0145833551883698, + "reward_std": 0.1507036415860057, + "rewards/accuracy_reward": 0.0375000013038516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833432674408, + "step": 1419 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.6541809082031, + "epoch": 0.45447271563450153, + "grad_norm": 0.18406108021736145, + "kl": 0.39999381825327873, + "learning_rate": 1.3274519869590656e-05, + "loss": 0.0841, + "reward": 1.0093750357627869, + "reward_std": 0.14787054806947708, + "rewards/accuracy_reward": 0.039583333767950535, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916865348816, + "step": 1420 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.1104370117188, + "epoch": 0.4547927668426948, + "grad_norm": 0.16637839376926422, + "kl": 0.4374181792140007, + "learning_rate": 1.3263957919982516e-05, + "loss": 0.0952, + "reward": 1.0369791865348816, + "reward_std": 0.16850600093603135, + "rewards/accuracy_reward": 0.06875000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291746139526, + "step": 1421 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.7020935058594, + "epoch": 0.45511281805088816, + "grad_norm": 0.11868277937173843, + "kl": 0.2869627773761749, + "learning_rate": 1.325339189354477e-05, + "loss": 0.0445, + "reward": 1.0234375298023224, + "reward_std": 0.11299450956285, + "rewards/accuracy_reward": 0.0458333345130086, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9755208551883697, + "step": 1422 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.7312683105469, + "epoch": 0.45543286925908144, + "grad_norm": 0.08685970306396484, + "kl": 0.25337800160050394, + "learning_rate": 1.3242821803474861e-05, + "loss": 0.0851, + "reward": 1.0723958492279053, + "reward_std": 0.13812698870897294, + "rewards/accuracy_reward": 0.1000000050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958492279052, + "step": 1423 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.2104278564453, + "epoch": 0.4557529204672748, + "grad_norm": 0.10650160908699036, + "kl": 0.44310767501592635, + "learning_rate": 1.3232247662975304e-05, + "loss": 0.1005, + "reward": 1.054166704416275, + "reward_std": 0.17150254175066948, + "rewards/accuracy_reward": 0.08958333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833432674408, + "step": 1424 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.7708557128906, + "epoch": 0.45607297167546806, + "grad_norm": 0.0737314224243164, + "kl": 0.26119700372219085, + "learning_rate": 1.3221669485253672e-05, + "loss": 0.0651, + "reward": 1.0958333730697631, + "reward_std": 0.09929907992482186, + "rewards/accuracy_reward": 0.11666667126119137, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1425 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.039599609375, + "epoch": 0.4563930228836614, + "grad_norm": 0.23615843057632446, + "kl": 0.5649647109210492, + "learning_rate": 1.3211087283522586e-05, + "loss": 0.1103, + "reward": 1.0640625298023223, + "reward_std": 0.1466512806713581, + "rewards/accuracy_reward": 0.09791666995733976, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458492279053, + "step": 1426 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.2666870117188, + "epoch": 0.4567130740918547, + "grad_norm": 0.08350611478090286, + "kl": 0.23456739112734795, + "learning_rate": 1.3200501070999687e-05, + "loss": 0.0658, + "reward": 1.024479192495346, + "reward_std": 0.11737537570297718, + "rewards/accuracy_reward": 0.04583333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1427 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.1104370117188, + "epoch": 0.457033125300048, + "grad_norm": 0.11149536818265915, + "kl": 0.24888658449053763, + "learning_rate": 1.3189910860907631e-05, + "loss": 0.0654, + "reward": 1.0437500476837158, + "reward_std": 0.13792316131293775, + "rewards/accuracy_reward": 0.06458333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1428 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.9500122070312, + "epoch": 0.4573531765082413, + "grad_norm": 0.08928157389163971, + "kl": 0.44104146808385847, + "learning_rate": 1.3179316666474063e-05, + "loss": 0.0931, + "reward": 1.069791704416275, + "reward_std": 0.16804887503385543, + "rewards/accuracy_reward": 0.10416666902601719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9656250178813934, + "step": 1429 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.0541839599609, + "epoch": 0.45767322771643465, + "grad_norm": 0.0916551873087883, + "kl": 0.4219443365931511, + "learning_rate": 1.3168718500931603e-05, + "loss": 0.0807, + "reward": 1.061979204416275, + "reward_std": 0.1655805967748165, + "rewards/accuracy_reward": 0.0958333345130086, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458551883697, + "step": 1430 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.0854370117188, + "epoch": 0.45799327892462793, + "grad_norm": 0.13563001155853271, + "kl": 0.29530239701271055, + "learning_rate": 1.315811637751784e-05, + "loss": 0.0605, + "reward": 1.0119791805744172, + "reward_std": 0.07693684957921505, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1431 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.0708557128906, + "epoch": 0.4583133301328213, + "grad_norm": 0.13829165697097778, + "kl": 0.6939724013209343, + "learning_rate": 1.3147510309475301e-05, + "loss": 0.0984, + "reward": 1.0390625238418578, + "reward_std": 0.14951678588986397, + "rewards/accuracy_reward": 0.08750000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.951562511920929, + "step": 1432 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.764599609375, + "epoch": 0.45863338134101456, + "grad_norm": 0.07941343635320663, + "kl": 0.4705329492688179, + "learning_rate": 1.3136900310051438e-05, + "loss": 0.1283, + "reward": 1.0531250178813933, + "reward_std": 0.19366805925965308, + "rewards/accuracy_reward": 0.10416666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9489583492279052, + "step": 1433 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.504183959961, + "epoch": 0.4589534325492079, + "grad_norm": 0.14773200452327728, + "kl": 0.30267433300614355, + "learning_rate": 1.312628639249861e-05, + "loss": 0.0869, + "reward": 1.1286458611488341, + "reward_std": 0.14947104826569557, + "rewards/accuracy_reward": 0.15833333935588598, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125119209289, + "step": 1434 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.289599609375, + "epoch": 0.4592734837574012, + "grad_norm": 0.06837635487318039, + "kl": 0.2559683620929718, + "learning_rate": 1.3115668570074083e-05, + "loss": 0.0478, + "reward": 1.1395833671092988, + "reward_std": 0.15164603665471077, + "rewards/accuracy_reward": 0.17291667219251394, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666865348816, + "step": 1435 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.6312744140625, + "epoch": 0.4595935349655945, + "grad_norm": 0.12697356939315796, + "kl": 0.35023130998015406, + "learning_rate": 1.3105046856039994e-05, + "loss": 0.0754, + "reward": 1.110416692495346, + "reward_std": 0.11461557075381279, + "rewards/accuracy_reward": 0.13750000707805157, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166746139526, + "step": 1436 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.8250244140625, + "epoch": 0.4599135861737878, + "grad_norm": 0.08488718420267105, + "kl": 0.3687517575919628, + "learning_rate": 1.309442126366333e-05, + "loss": 0.0785, + "reward": 1.0333333492279053, + "reward_std": 0.14628687109798194, + "rewards/accuracy_reward": 0.06250000018626452, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333492279053, + "step": 1437 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.1646026611328, + "epoch": 0.46023363738198114, + "grad_norm": 0.15668301284313202, + "kl": 0.31853192709386347, + "learning_rate": 1.308379180621594e-05, + "loss": 0.0935, + "reward": 1.071354204416275, + "reward_std": 0.15687522031366824, + "rewards/accuracy_reward": 0.10208333767950535, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708551883698, + "step": 1438 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.6916931152343, + "epoch": 0.4605536885901744, + "grad_norm": 0.10902131348848343, + "kl": 0.34631902873516085, + "learning_rate": 1.3073158496974487e-05, + "loss": 0.0796, + "reward": 1.0937500238418578, + "reward_std": 0.1236114427447319, + "rewards/accuracy_reward": 0.12083333749324084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166805744172, + "step": 1439 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.9875244140625, + "epoch": 0.46087373979836777, + "grad_norm": 0.0653231143951416, + "kl": 0.14004647061228753, + "learning_rate": 1.3062521349220459e-05, + "loss": 0.0568, + "reward": 1.0395833611488343, + "reward_std": 0.13148568905889987, + "rewards/accuracy_reward": 0.05416666846722364, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166805744171, + "step": 1440 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.5166931152344, + "epoch": 0.46119379100656105, + "grad_norm": 0.2973286509513855, + "kl": 0.2680055730044842, + "learning_rate": 1.3051880376240117e-05, + "loss": 0.1017, + "reward": 1.0463541984558105, + "reward_std": 0.15086615979671478, + "rewards/accuracy_reward": 0.07916666828095913, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9651041746139526, + "step": 1441 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.0729370117188, + "epoch": 0.4615138422147544, + "grad_norm": 0.10781467705965042, + "kl": 0.46533993929624556, + "learning_rate": 1.3041235591324521e-05, + "loss": 0.1004, + "reward": 1.0479166865348817, + "reward_std": 0.1986624613404274, + "rewards/accuracy_reward": 0.08750000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9604166805744171, + "step": 1442 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.733349609375, + "epoch": 0.46183389342294767, + "grad_norm": 0.08439289033412933, + "kl": 0.14252968803048133, + "learning_rate": 1.3030587007769486e-05, + "loss": 0.0462, + "reward": 1.0255208551883697, + "reward_std": 0.10829742904752493, + "rewards/accuracy_reward": 0.04375000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708492279053, + "step": 1443 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.30419921875, + "epoch": 0.46215394463114096, + "grad_norm": 0.13936440646648407, + "kl": 0.2732355587184429, + "learning_rate": 1.3019934638875565e-05, + "loss": 0.0673, + "reward": 1.0255208432674408, + "reward_std": 0.13787070866674184, + "rewards/accuracy_reward": 0.05000000055879354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208492279053, + "step": 1444 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.7437622070313, + "epoch": 0.4624739958393343, + "grad_norm": 0.1179627776145935, + "kl": 0.3253137730062008, + "learning_rate": 1.3009278497948046e-05, + "loss": 0.0835, + "reward": 1.054687535762787, + "reward_std": 0.13018050529062747, + "rewards/accuracy_reward": 0.08750000353902579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875178813935, + "step": 1445 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.2541809082031, + "epoch": 0.4627940470475276, + "grad_norm": 0.07859959453344345, + "kl": 0.25831368714571, + "learning_rate": 1.2998618598296922e-05, + "loss": 0.1257, + "reward": 0.9828125298023224, + "reward_std": 0.14702120125293733, + "rewards/accuracy_reward": 0.02083333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791805744171, + "step": 1446 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.6625183105468, + "epoch": 0.4631140982557209, + "grad_norm": 0.20410098135471344, + "kl": 0.2851632609963417, + "learning_rate": 1.298795495323689e-05, + "loss": 0.0749, + "reward": 1.0578125238418579, + "reward_std": 0.11233447343111039, + "rewards/accuracy_reward": 0.08750000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125178813934, + "step": 1447 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.816683959961, + "epoch": 0.4634341494639142, + "grad_norm": 0.10286929458379745, + "kl": 0.39714363887906073, + "learning_rate": 1.297728757608732e-05, + "loss": 0.0495, + "reward": 1.0468750298023224, + "reward_std": 0.10802833493798972, + "rewards/accuracy_reward": 0.06458333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916805744171, + "step": 1448 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.7479370117187, + "epoch": 0.46375420067210754, + "grad_norm": 0.19550444185733795, + "kl": 0.3406421348452568, + "learning_rate": 1.2966616480172243e-05, + "loss": 0.1146, + "reward": 1.0473958611488343, + "reward_std": 0.1252935364842415, + "rewards/accuracy_reward": 0.0812500011175871, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458492279053, + "step": 1449 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.3687683105469, + "epoch": 0.4640742518803008, + "grad_norm": 0.2726002335548401, + "kl": 0.23093743473291398, + "learning_rate": 1.2955941678820332e-05, + "loss": 0.04, + "reward": 1.0791666984558106, + "reward_std": 0.13660317473113537, + "rewards/accuracy_reward": 0.09791666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.981250011920929, + "step": 1450 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.6291931152343, + "epoch": 0.46439430308849416, + "grad_norm": 0.12068649381399155, + "kl": 0.3264674745500088, + "learning_rate": 1.2945263185364895e-05, + "loss": 0.0531, + "reward": 1.0875000298023223, + "reward_std": 0.1675384446978569, + "rewards/accuracy_reward": 0.11666667014360428, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 1451 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.3708435058594, + "epoch": 0.46471435429668745, + "grad_norm": 0.14527225494384766, + "kl": 0.2746707245707512, + "learning_rate": 1.293458101314385e-05, + "loss": 0.0709, + "reward": 0.9901041924953461, + "reward_std": 0.11297452226281166, + "rewards/accuracy_reward": 0.018750000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 1452 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.8437622070312, + "epoch": 0.4650344055048808, + "grad_norm": 0.12880566716194153, + "kl": 0.31128202080726625, + "learning_rate": 1.292389517549971e-05, + "loss": 0.1268, + "reward": 1.0010416805744171, + "reward_std": 0.1498323068022728, + "rewards/accuracy_reward": 0.047916668094694616, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9531250178813935, + "step": 1453 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.5083618164062, + "epoch": 0.46535445671307407, + "grad_norm": 0.13385367393493652, + "kl": 0.44625467509031297, + "learning_rate": 1.2913205685779557e-05, + "loss": 0.0819, + "reward": 1.1098958611488343, + "reward_std": 0.15377984158694744, + "rewards/accuracy_reward": 0.14583333991467953, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9619791865348816, + "step": 1454 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.0291870117187, + "epoch": 0.4656745079212674, + "grad_norm": 0.10405029356479645, + "kl": 0.2602963488548994, + "learning_rate": 1.2902512557335047e-05, + "loss": 0.0737, + "reward": 0.9890625178813934, + "reward_std": 0.1055045148357749, + "rewards/accuracy_reward": 0.010416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1455 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.4521118164063, + "epoch": 0.4659945591294607, + "grad_norm": 0.07072511315345764, + "kl": 0.3464597135782242, + "learning_rate": 1.2891815803522378e-05, + "loss": 0.0692, + "reward": 1.0333333611488342, + "reward_std": 0.16075058728456498, + "rewards/accuracy_reward": 0.06666666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666865348816, + "step": 1456 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.8562652587891, + "epoch": 0.46631461033765403, + "grad_norm": 0.13768284022808075, + "kl": 0.3706891119480133, + "learning_rate": 1.2881115437702274e-05, + "loss": 0.0992, + "reward": 1.0208333611488343, + "reward_std": 0.1893759747967124, + "rewards/accuracy_reward": 0.0708333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9500000178813934, + "step": 1457 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.783349609375, + "epoch": 0.4666346615458473, + "grad_norm": 0.18860451877117157, + "kl": 0.361300827562809, + "learning_rate": 1.287041147323997e-05, + "loss": 0.1191, + "reward": 1.023437535762787, + "reward_std": 0.1736320335417986, + "rewards/accuracy_reward": 0.060416667349636556, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208551883698, + "step": 1458 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.2229431152343, + "epoch": 0.46695471275404066, + "grad_norm": 0.10289126634597778, + "kl": 0.2510980851948261, + "learning_rate": 1.2859703923505194e-05, + "loss": 0.1045, + "reward": 1.054687535762787, + "reward_std": 0.16347628384828566, + "rewards/accuracy_reward": 0.08333333693444729, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541984558105, + "step": 1459 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.0083618164062, + "epoch": 0.46727476396223394, + "grad_norm": 0.20354001224040985, + "kl": 0.6329892605543137, + "learning_rate": 1.2848992801872159e-05, + "loss": 0.1397, + "reward": 0.9583333551883697, + "reward_std": 0.1352860927581787, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9520833551883697, + "step": 1460 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.6750305175781, + "epoch": 0.4675948151704273, + "grad_norm": 0.08727524429559708, + "kl": 0.30383690968155863, + "learning_rate": 1.2838278121719536e-05, + "loss": 0.0646, + "reward": 1.0593750298023223, + "reward_std": 0.12578080594539642, + "rewards/accuracy_reward": 0.08958333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 1461 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.2791870117187, + "epoch": 0.46791486637862056, + "grad_norm": 0.16131596267223358, + "kl": 0.31957222819328307, + "learning_rate": 1.2827559896430437e-05, + "loss": 0.109, + "reward": 1.0489583671092988, + "reward_std": 0.13771594911813737, + "rewards/accuracy_reward": 0.08333333544433116, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9656250238418579, + "step": 1462 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.7104370117188, + "epoch": 0.4682349175868139, + "grad_norm": 0.1428447663784027, + "kl": 0.3186722435057163, + "learning_rate": 1.2816838139392407e-05, + "loss": 0.0811, + "reward": 1.0541666924953461, + "reward_std": 0.13920316584408282, + "rewards/accuracy_reward": 0.08125000409781932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166805744172, + "step": 1463 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.6666809082031, + "epoch": 0.4685549687950072, + "grad_norm": 0.13763944804668427, + "kl": 0.2436652660369873, + "learning_rate": 1.2806112863997401e-05, + "loss": 0.0541, + "reward": 1.1296875298023223, + "reward_std": 0.1380587823688984, + "rewards/accuracy_reward": 0.15416667181998492, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1464 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.4270935058594, + "epoch": 0.4688750200032005, + "grad_norm": 0.10999301820993423, + "kl": 0.24977174699306487, + "learning_rate": 1.279538408364177e-05, + "loss": 0.0907, + "reward": 1.0541666984558105, + "reward_std": 0.11675290018320084, + "rewards/accuracy_reward": 0.07500000316649676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666746139527, + "step": 1465 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.502099609375, + "epoch": 0.4691950712113938, + "grad_norm": 0.1865115612745285, + "kl": 0.29081103280186654, + "learning_rate": 1.2784651811726238e-05, + "loss": 0.0705, + "reward": 1.0786458551883698, + "reward_std": 0.10002645067870616, + "rewards/accuracy_reward": 0.09791666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291805744172, + "step": 1466 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.252099609375, + "epoch": 0.46951512241958715, + "grad_norm": 0.215255007147789, + "kl": 0.4380812518298626, + "learning_rate": 1.2773916061655893e-05, + "loss": 0.1143, + "reward": 1.0145833611488342, + "reward_std": 0.17872334346175195, + "rewards/accuracy_reward": 0.05208333544433117, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9604166865348815, + "step": 1467 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.3271026611328, + "epoch": 0.46983517362778043, + "grad_norm": 0.11711432039737701, + "kl": 0.3209188118577003, + "learning_rate": 1.276317684684017e-05, + "loss": 0.0772, + "reward": 1.0890625238418579, + "reward_std": 0.0958840724080801, + "rewards/accuracy_reward": 0.11250000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625119209289, + "step": 1468 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.5458526611328, + "epoch": 0.4701552248359738, + "grad_norm": 0.16603219509124756, + "kl": 0.3087894439697266, + "learning_rate": 1.275243418069283e-05, + "loss": 0.077, + "reward": 1.1411458611488343, + "reward_std": 0.15554081853479146, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791865348816, + "step": 1469 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.839599609375, + "epoch": 0.47047527604416706, + "grad_norm": 0.1444757878780365, + "kl": 0.5770320266485214, + "learning_rate": 1.2741688076631942e-05, + "loss": 0.1205, + "reward": 1.0265625178813935, + "reward_std": 0.16366661600768567, + "rewards/accuracy_reward": 0.07083333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.955729192495346, + "step": 1470 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.1187683105469, + "epoch": 0.4707953272523604, + "grad_norm": 0.10163528472185135, + "kl": 0.3341947510838509, + "learning_rate": 1.2730938548079873e-05, + "loss": 0.0968, + "reward": 1.0072916924953461, + "reward_std": 0.13251439444720745, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.965625011920929, + "step": 1471 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.0416839599609, + "epoch": 0.4711153784605537, + "grad_norm": 0.24541088938713074, + "kl": 0.4345988750457764, + "learning_rate": 1.2720185608463258e-05, + "loss": 0.1186, + "reward": 1.1192708611488342, + "reward_std": 0.18835799023509026, + "rewards/accuracy_reward": 0.14583333879709243, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375238418579, + "step": 1472 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.9062652587891, + "epoch": 0.471435429668747, + "grad_norm": 0.26369932293891907, + "kl": 0.33843834325671196, + "learning_rate": 1.2709429271213009e-05, + "loss": 0.1012, + "reward": 1.0208333492279054, + "reward_std": 0.12742016687989235, + "rewards/accuracy_reward": 0.04791666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166865348816, + "step": 1473 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.9416809082031, + "epoch": 0.4717554808769403, + "grad_norm": 0.10718197375535965, + "kl": 0.31209646463394164, + "learning_rate": 1.2698669549764272e-05, + "loss": 0.0915, + "reward": 1.0500000178813935, + "reward_std": 0.15133947264403105, + "rewards/accuracy_reward": 0.08125000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500178813935, + "step": 1474 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.1021026611328, + "epoch": 0.47207553208513364, + "grad_norm": 0.08452266454696655, + "kl": 0.3574494168162346, + "learning_rate": 1.2687906457556416e-05, + "loss": 0.1113, + "reward": 1.0921875417232514, + "reward_std": 0.13789307028055192, + "rewards/accuracy_reward": 0.11666667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208611488342, + "step": 1475 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.202099609375, + "epoch": 0.4723955832933269, + "grad_norm": 0.2100512832403183, + "kl": 0.35172075033187866, + "learning_rate": 1.267714000803303e-05, + "loss": 0.0955, + "reward": 1.0885416865348816, + "reward_std": 0.12355173248797655, + "rewards/accuracy_reward": 0.11458333674818277, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 1476 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.2187713623047, + "epoch": 0.47271563450152027, + "grad_norm": 0.07331164926290512, + "kl": 0.2190377414226532, + "learning_rate": 1.266637021464189e-05, + "loss": 0.0696, + "reward": 1.0927083611488342, + "reward_std": 0.07819271050393581, + "rewards/accuracy_reward": 0.10833333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1477 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.1166839599609, + "epoch": 0.47303568570971355, + "grad_norm": 0.06377699971199036, + "kl": 0.2705439478158951, + "learning_rate": 1.265559709083495e-05, + "loss": 0.0505, + "reward": 1.0239583432674408, + "reward_std": 0.10356269031763077, + "rewards/accuracy_reward": 0.039583335444331166, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9822916746139526, + "step": 1478 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.3437683105469, + "epoch": 0.4733557369179069, + "grad_norm": 0.08778975158929825, + "kl": 0.23727463632822038, + "learning_rate": 1.2644820650068323e-05, + "loss": 0.0854, + "reward": 1.1609375298023223, + "reward_std": 0.09437338933348656, + "rewards/accuracy_reward": 0.17708333767950535, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541746139526, + "step": 1479 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.3250183105469, + "epoch": 0.4736757881261002, + "grad_norm": 0.07925225049257278, + "kl": 0.2749205954372883, + "learning_rate": 1.2634040905802267e-05, + "loss": 0.0885, + "reward": 1.097916704416275, + "reward_std": 0.12705145999789239, + "rewards/accuracy_reward": 0.1187500013038516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1480 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.7979339599609, + "epoch": 0.4739958393342935, + "grad_norm": 0.06212180107831955, + "kl": 0.12944966927170753, + "learning_rate": 1.2623257871501165e-05, + "loss": 0.0504, + "reward": 1.1041666984558105, + "reward_std": 0.08514518775045872, + "rewards/accuracy_reward": 0.11458333693444729, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1481 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.9333557128906, + "epoch": 0.4743158905424868, + "grad_norm": 0.07425817102193832, + "kl": 0.21376031935214995, + "learning_rate": 1.2612471560633512e-05, + "loss": 0.0614, + "reward": 1.0046875119209289, + "reward_std": 0.06692595425993204, + "rewards/accuracy_reward": 0.01458333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9901041805744171, + "step": 1482 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.8854400634766, + "epoch": 0.47463594175068013, + "grad_norm": 0.053915202617645264, + "kl": 0.1447036750614643, + "learning_rate": 1.260168198667189e-05, + "loss": 0.045, + "reward": 1.1250000298023224, + "reward_std": 0.1259409360587597, + "rewards/accuracy_reward": 0.13958333749324084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166865348816, + "step": 1483 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.1687652587891, + "epoch": 0.4749559929588734, + "grad_norm": 0.19801031053066254, + "kl": 0.18226547986268998, + "learning_rate": 1.2590889163092963e-05, + "loss": 0.0328, + "reward": 1.0447916865348816, + "reward_std": 0.09412735253572464, + "rewards/accuracy_reward": 0.05625000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9885416746139526, + "step": 1484 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.0125183105469, + "epoch": 0.47527604416706676, + "grad_norm": 0.1381172090768814, + "kl": 0.3441140428185463, + "learning_rate": 1.2580093103377446e-05, + "loss": 0.0378, + "reward": 1.113541692495346, + "reward_std": 0.10265696365386248, + "rewards/accuracy_reward": 0.1270833384245634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583432674408, + "step": 1485 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.4166870117188, + "epoch": 0.47559609537526004, + "grad_norm": 0.05816073715686798, + "kl": 0.18868185505270957, + "learning_rate": 1.2569293821010109e-05, + "loss": 0.0696, + "reward": 1.0906250298023223, + "reward_std": 0.08026152718812227, + "rewards/accuracy_reward": 0.10208333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9885416805744172, + "step": 1486 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.7750213623046, + "epoch": 0.4759161465834533, + "grad_norm": 0.06502912193536758, + "kl": 0.13506832644343375, + "learning_rate": 1.2558491329479732e-05, + "loss": 0.0541, + "reward": 1.0291667044162751, + "reward_std": 0.08429014421999455, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333551883697, + "step": 1487 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.0937683105469, + "epoch": 0.47623619779164666, + "grad_norm": 0.04643028974533081, + "kl": 0.10432569906115532, + "learning_rate": 1.2547685642279113e-05, + "loss": 0.0304, + "reward": 1.1744791984558105, + "reward_std": 0.09066424928605557, + "rewards/accuracy_reward": 0.1833333380520344, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9911458373069764, + "step": 1488 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.2187744140625, + "epoch": 0.47655624899983995, + "grad_norm": 0.055091604590415955, + "kl": 0.18568008467555047, + "learning_rate": 1.253687677290504e-05, + "loss": 0.052, + "reward": 1.0333333492279053, + "reward_std": 0.08247921578586101, + "rewards/accuracy_reward": 0.04791666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166746139527, + "step": 1489 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.9479370117188, + "epoch": 0.4768763002080333, + "grad_norm": 0.11636830866336823, + "kl": 0.17313535772264005, + "learning_rate": 1.2526064734858277e-05, + "loss": 0.0694, + "reward": 1.1880208730697632, + "reward_std": 0.1414164997637272, + "rewards/accuracy_reward": 0.21250000800937413, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1490 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.087515258789, + "epoch": 0.47719635141622657, + "grad_norm": 0.04212115705013275, + "kl": 0.13310096897184848, + "learning_rate": 1.2515249541643537e-05, + "loss": 0.0488, + "reward": 1.0312500178813935, + "reward_std": 0.06270579267293215, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9895833492279053, + "step": 1491 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.9500305175782, + "epoch": 0.4775164026244199, + "grad_norm": 0.0569349005818367, + "kl": 0.1305923953652382, + "learning_rate": 1.2504431206769487e-05, + "loss": 0.0547, + "reward": 1.0276041746139526, + "reward_std": 0.09775371477007866, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 1492 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.3604431152344, + "epoch": 0.4778364538326132, + "grad_norm": 0.09578822553157806, + "kl": 0.1873940646648407, + "learning_rate": 1.2493609743748709e-05, + "loss": 0.0518, + "reward": 1.079166692495346, + "reward_std": 0.1045138731598854, + "rewards/accuracy_reward": 0.09583333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333432674408, + "step": 1493 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.6750244140625, + "epoch": 0.47815650504080653, + "grad_norm": 0.05655219033360481, + "kl": 0.1329231120646, + "learning_rate": 1.2482785166097697e-05, + "loss": 0.0415, + "reward": 1.0520833671092986, + "reward_std": 0.11519988998770714, + "rewards/accuracy_reward": 0.06875000111758708, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333551883697, + "step": 1494 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.6271118164062, + "epoch": 0.4784765562489998, + "grad_norm": 0.1878066509962082, + "kl": 0.2167329777032137, + "learning_rate": 1.247195748733683e-05, + "loss": 0.0675, + "reward": 1.0916666984558105, + "reward_std": 0.08340043239295483, + "rewards/accuracy_reward": 0.11041666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9812500178813934, + "step": 1495 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.614599609375, + "epoch": 0.47879660745719316, + "grad_norm": 0.06385314464569092, + "kl": 0.18466985821723939, + "learning_rate": 1.2461126720990367e-05, + "loss": 0.0717, + "reward": 1.004687523841858, + "reward_std": 0.08886413350701332, + "rewards/accuracy_reward": 0.0229166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708551883697, + "step": 1496 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.7083435058594, + "epoch": 0.47911665866538644, + "grad_norm": 0.0636759102344513, + "kl": 0.12463123425841331, + "learning_rate": 1.2450292880586414e-05, + "loss": 0.0456, + "reward": 1.0427083551883698, + "reward_std": 0.05680535212159157, + "rewards/accuracy_reward": 0.05625000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583432674408, + "step": 1497 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.3458618164062, + "epoch": 0.4794367098735798, + "grad_norm": 0.1012827605009079, + "kl": 0.1598178006708622, + "learning_rate": 1.2439455979656931e-05, + "loss": 0.0343, + "reward": 1.0802083551883697, + "reward_std": 0.0793739415705204, + "rewards/accuracy_reward": 0.08958333488553763, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9885416805744172, + "step": 1498 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.6041900634766, + "epoch": 0.47975676108177306, + "grad_norm": 0.15515443682670593, + "kl": 0.21187707930803298, + "learning_rate": 1.2428616031737688e-05, + "loss": 0.0709, + "reward": 1.1161458551883698, + "reward_std": 0.12064327895641327, + "rewards/accuracy_reward": 0.13333333879709244, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.982812511920929, + "step": 1499 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.1687683105469, + "epoch": 0.4800768122899664, + "grad_norm": 0.10084983706474304, + "kl": 0.128275853022933, + "learning_rate": 1.241777305036827e-05, + "loss": 0.0384, + "reward": 1.0453125178813933, + "reward_std": 0.12377176433801651, + "rewards/accuracy_reward": 0.06250000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9828125178813935, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.1291870117187, + "epoch": 0.4803968634981597, + "grad_norm": 0.09301966428756714, + "kl": 0.18412938639521598, + "learning_rate": 1.2406927049092034e-05, + "loss": 0.0557, + "reward": 1.0333333611488342, + "reward_std": 0.08592780809849501, + "rewards/accuracy_reward": 0.05416666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1501 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.5791870117188, + "epoch": 0.480716914706353, + "grad_norm": 0.1392291635274887, + "kl": 0.31433880925178526, + "learning_rate": 1.2396078041456137e-05, + "loss": 0.1133, + "reward": 0.9890625298023223, + "reward_std": 0.12419578358530999, + "rewards/accuracy_reward": 0.01458333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791924953461, + "step": 1502 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.0854278564453, + "epoch": 0.4810369659145463, + "grad_norm": 0.21207989752292633, + "kl": 0.3643794015049934, + "learning_rate": 1.2385226041011464e-05, + "loss": 0.0725, + "reward": 1.076562523841858, + "reward_std": 0.13115446493029595, + "rewards/accuracy_reward": 0.09583333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291805744172, + "step": 1503 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.4500152587891, + "epoch": 0.48135701712273965, + "grad_norm": 0.10227753221988678, + "kl": 0.2762680515646935, + "learning_rate": 1.2374371061312655e-05, + "loss": 0.0846, + "reward": 1.1239583671092988, + "reward_std": 0.1950426936149597, + "rewards/accuracy_reward": 0.14375000596046447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083551883698, + "step": 1504 + }, + { + "clip_ratio": 0.0, + "completion_length": 622.4208435058594, + "epoch": 0.48167706833093293, + "grad_norm": 0.08751754462718964, + "kl": 0.33224751353263854, + "learning_rate": 1.2363513115918065e-05, + "loss": 0.072, + "reward": 1.0447916984558105, + "reward_std": 0.11386512406170368, + "rewards/accuracy_reward": 0.0708333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583551883697, + "step": 1505 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.8750183105469, + "epoch": 0.4819971195391263, + "grad_norm": 0.16370034217834473, + "kl": 0.20460733622312546, + "learning_rate": 1.2352652218389754e-05, + "loss": 0.0757, + "reward": 1.0338541746139527, + "reward_std": 0.1230011885985732, + "rewards/accuracy_reward": 0.05625000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041746139527, + "step": 1506 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.0771026611328, + "epoch": 0.48231717074731956, + "grad_norm": 0.07614739239215851, + "kl": 0.2973733879625797, + "learning_rate": 1.2341788382293467e-05, + "loss": 0.0886, + "reward": 1.0916666984558105, + "reward_std": 0.13258982375264167, + "rewards/accuracy_reward": 0.11458333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833492279053, + "step": 1507 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.1729370117188, + "epoch": 0.4826372219555129, + "grad_norm": 0.34557607769966125, + "kl": 0.2178966648876667, + "learning_rate": 1.2330921621198624e-05, + "loss": 0.0636, + "reward": 1.0552083671092987, + "reward_std": 0.11473358049988747, + "rewards/accuracy_reward": 0.07500000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083492279052, + "step": 1508 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.839599609375, + "epoch": 0.4829572731637062, + "grad_norm": 0.3721748888492584, + "kl": 0.33902390524744985, + "learning_rate": 1.2320051948678295e-05, + "loss": 0.0928, + "reward": 1.1062500298023223, + "reward_std": 0.14177928101271392, + "rewards/accuracy_reward": 0.13333333786576987, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166924953461, + "step": 1509 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.5125122070312, + "epoch": 0.4832773243718995, + "grad_norm": 0.10752350836992264, + "kl": 0.30240178406238555, + "learning_rate": 1.2309179378309188e-05, + "loss": 0.1099, + "reward": 1.0609375357627868, + "reward_std": 0.15852489322423935, + "rewards/accuracy_reward": 0.09791667088866234, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208551883698, + "step": 1510 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.4062683105469, + "epoch": 0.4835973755800928, + "grad_norm": 0.10335452109575272, + "kl": 0.34756200537085535, + "learning_rate": 1.2298303923671635e-05, + "loss": 0.0878, + "reward": 1.1312500417232514, + "reward_std": 0.14642674401402472, + "rewards/accuracy_reward": 0.16041667088866235, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333551883698, + "step": 1511 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.2604431152344, + "epoch": 0.48391742678828614, + "grad_norm": 0.09037502855062485, + "kl": 0.2697148099541664, + "learning_rate": 1.2287425598349558e-05, + "loss": 0.0704, + "reward": 1.0531250357627868, + "reward_std": 0.15801854096353055, + "rewards/accuracy_reward": 0.08125000447034836, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750178813934, + "step": 1512 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.7021057128907, + "epoch": 0.4842374779964794, + "grad_norm": 0.10988093167543411, + "kl": 0.41062536016106604, + "learning_rate": 1.2276544415930476e-05, + "loss": 0.0912, + "reward": 1.0375000417232514, + "reward_std": 0.2113563533872366, + "rewards/accuracy_reward": 0.0770833371207118, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9583333492279053, + "step": 1513 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.2291931152344, + "epoch": 0.48455752920467277, + "grad_norm": 0.12491331994533539, + "kl": 0.4697680056095123, + "learning_rate": 1.2265660390005474e-05, + "loss": 0.0787, + "reward": 1.0557291805744171, + "reward_std": 0.13224833961576224, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125119209289, + "step": 1514 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.4583618164063, + "epoch": 0.48487758041286605, + "grad_norm": 0.20729385316371918, + "kl": 0.41887201070785524, + "learning_rate": 1.2254773534169188e-05, + "loss": 0.0538, + "reward": 1.1151042103767395, + "reward_std": 0.10528469458222389, + "rewards/accuracy_reward": 0.1458333373069763, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708551883698, + "step": 1515 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.0312744140625, + "epoch": 0.4851976316210594, + "grad_norm": 0.12627162039279938, + "kl": 0.40239308923482897, + "learning_rate": 1.2243883862019787e-05, + "loss": 0.1111, + "reward": 1.1109375536441803, + "reward_std": 0.18767590299248696, + "rewards/accuracy_reward": 0.14375000558793544, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875178813935, + "step": 1516 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.4666809082031, + "epoch": 0.4855176828292527, + "grad_norm": 0.16976335644721985, + "kl": 0.4639628753066063, + "learning_rate": 1.2232991387158957e-05, + "loss": 0.0657, + "reward": 1.0390625298023224, + "reward_std": 0.1840406185016036, + "rewards/accuracy_reward": 0.0687500013038516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125119209289, + "step": 1517 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.6625244140625, + "epoch": 0.485837734037446, + "grad_norm": 0.2504444718360901, + "kl": 0.4745438635349274, + "learning_rate": 1.2222096123191891e-05, + "loss": 0.0635, + "reward": 1.154166692495346, + "reward_std": 0.17704404518008232, + "rewards/accuracy_reward": 0.18750000409781933, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666805744171, + "step": 1518 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.3666931152344, + "epoch": 0.4861577852456393, + "grad_norm": 0.15202420949935913, + "kl": 0.6530636228621006, + "learning_rate": 1.2211198083727262e-05, + "loss": 0.11, + "reward": 1.0015625178813934, + "reward_std": 0.1717325121164322, + "rewards/accuracy_reward": 0.0479166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9536458492279053, + "step": 1519 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.3354370117188, + "epoch": 0.48647783645383263, + "grad_norm": 0.20241940021514893, + "kl": 0.46638872250914576, + "learning_rate": 1.2200297282377207e-05, + "loss": 0.0907, + "reward": 1.0963541984558105, + "reward_std": 0.13941559456288816, + "rewards/accuracy_reward": 0.1270833384245634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9692708492279053, + "step": 1520 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.6875244140625, + "epoch": 0.4867978876620259, + "grad_norm": 0.1839069426059723, + "kl": 0.36424013748764994, + "learning_rate": 1.2189393732757313e-05, + "loss": 0.0786, + "reward": 1.035416704416275, + "reward_std": 0.10752957388758659, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1521 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.3687622070313, + "epoch": 0.48711793887021926, + "grad_norm": 0.3818299174308777, + "kl": 0.43634158819913865, + "learning_rate": 1.2178487448486607e-05, + "loss": 0.0566, + "reward": 1.0375000298023225, + "reward_std": 0.12123782895505428, + "rewards/accuracy_reward": 0.06250000130385161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1522 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.6416931152344, + "epoch": 0.48743799007841254, + "grad_norm": 0.1835007667541504, + "kl": 0.6065421789884567, + "learning_rate": 1.2167578443187523e-05, + "loss": 0.0932, + "reward": 1.1057291746139526, + "reward_std": 0.14355219900608063, + "rewards/accuracy_reward": 0.13958333637565373, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458492279053, + "step": 1523 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.9229309082032, + "epoch": 0.4877580412866059, + "grad_norm": 0.23025159537792206, + "kl": 0.3891874521970749, + "learning_rate": 1.2156666730485895e-05, + "loss": 0.0711, + "reward": 1.0072916805744172, + "reward_std": 0.15119225680828094, + "rewards/accuracy_reward": 0.035416667349636555, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.971875011920929, + "step": 1524 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.2854370117187, + "epoch": 0.48807809249479917, + "grad_norm": 0.15648485720157623, + "kl": 0.45245604068040846, + "learning_rate": 1.2145752324010948e-05, + "loss": 0.0598, + "reward": 1.0708333492279052, + "reward_std": 0.10634525269269943, + "rewards/accuracy_reward": 0.09583333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1525 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.4625122070313, + "epoch": 0.4883981437029925, + "grad_norm": 0.23647770285606384, + "kl": 0.31469220519065855, + "learning_rate": 1.2134835237395254e-05, + "loss": 0.054, + "reward": 0.9968750238418579, + "reward_std": 0.1189862385392189, + "rewards/accuracy_reward": 0.020833333767950536, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1526 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.8854339599609, + "epoch": 0.4887181949111858, + "grad_norm": 0.08674870431423187, + "kl": 0.32070714607834816, + "learning_rate": 1.2123915484274755e-05, + "loss": 0.0821, + "reward": 1.0953125476837158, + "reward_std": 0.1944058895111084, + "rewards/accuracy_reward": 0.12083333935588599, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791686534882, + "step": 1527 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.8812805175781, + "epoch": 0.4890382461193791, + "grad_norm": 0.09347715973854065, + "kl": 0.32560470774769784, + "learning_rate": 1.2112993078288702e-05, + "loss": 0.0686, + "reward": 1.021875023841858, + "reward_std": 0.12335868813097477, + "rewards/accuracy_reward": 0.0458333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1528 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.4937774658204, + "epoch": 0.4893582973275724, + "grad_norm": 0.17592473328113556, + "kl": 0.4667899638414383, + "learning_rate": 1.2102068033079672e-05, + "loss": 0.1084, + "reward": 1.0171875298023223, + "reward_std": 0.16079341247677803, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375059604645, + "step": 1529 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.1625183105468, + "epoch": 0.4896783485357657, + "grad_norm": 0.18604399263858795, + "kl": 0.6784585162997245, + "learning_rate": 1.2091140362293538e-05, + "loss": 0.0743, + "reward": 1.1005208611488342, + "reward_std": 0.17486817315220832, + "rewards/accuracy_reward": 0.1333333384245634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875238418579, + "step": 1530 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.4166870117188, + "epoch": 0.48999839974395903, + "grad_norm": 0.24168041348457336, + "kl": 0.4977486953139305, + "learning_rate": 1.2080210079579452e-05, + "loss": 0.094, + "reward": 1.052604192495346, + "reward_std": 0.1793554574251175, + "rewards/accuracy_reward": 0.08541666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875, + "step": 1531 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.9062683105469, + "epoch": 0.4903184509521523, + "grad_norm": 0.27037033438682556, + "kl": 0.486817866563797, + "learning_rate": 1.2069277198589819e-05, + "loss": 0.0892, + "reward": 1.1145833730697632, + "reward_std": 0.15329679772257804, + "rewards/accuracy_reward": 0.14583333786576985, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500119209289, + "step": 1532 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.5521057128906, + "epoch": 0.49063850216034566, + "grad_norm": 0.12519511580467224, + "kl": 0.5029525205492973, + "learning_rate": 1.2058341732980303e-05, + "loss": 0.0983, + "reward": 1.071875023841858, + "reward_std": 0.1138947419822216, + "rewards/accuracy_reward": 0.10208333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 1533 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.558349609375, + "epoch": 0.49095855336853894, + "grad_norm": 0.2664872407913208, + "kl": 0.3999428883194923, + "learning_rate": 1.2047403696409787e-05, + "loss": 0.0554, + "reward": 1.012500023841858, + "reward_std": 0.09246203228831291, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166746139526, + "step": 1534 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.7562683105468, + "epoch": 0.4912786045767323, + "grad_norm": 0.21524342894554138, + "kl": 0.5321291498839855, + "learning_rate": 1.2036463102540375e-05, + "loss": 0.0624, + "reward": 1.0281250119209289, + "reward_std": 0.20354544073343278, + "rewards/accuracy_reward": 0.06250000242143869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9656250178813934, + "step": 1535 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.5437713623047, + "epoch": 0.49159865578492556, + "grad_norm": 0.1489776223897934, + "kl": 0.42065939456224444, + "learning_rate": 1.202551996503735e-05, + "loss": 0.1074, + "reward": 1.012500023841858, + "reward_std": 0.11747174710035324, + "rewards/accuracy_reward": 0.045833334885537626, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9666666865348816, + "step": 1536 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.7625213623047, + "epoch": 0.4919187069931189, + "grad_norm": 0.12150443345308304, + "kl": 0.27668090388178823, + "learning_rate": 1.2014574297569182e-05, + "loss": 0.044, + "reward": 1.1286458551883698, + "reward_std": 0.08452764227986335, + "rewards/accuracy_reward": 0.14166667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9869791686534881, + "step": 1537 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.3229370117188, + "epoch": 0.4922387582013122, + "grad_norm": 0.2578086256980896, + "kl": 0.4889775365591049, + "learning_rate": 1.2003626113807504e-05, + "loss": 0.1198, + "reward": 1.1427083611488342, + "reward_std": 0.1477345634251833, + "rewards/accuracy_reward": 0.17291667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.969791692495346, + "step": 1538 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.3729370117187, + "epoch": 0.4925588094095055, + "grad_norm": 0.2890605926513672, + "kl": 0.750163146853447, + "learning_rate": 1.1992675427427085e-05, + "loss": 0.0899, + "reward": 1.0348958611488341, + "reward_std": 0.18009860459715127, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458492279053, + "step": 1539 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.2646026611328, + "epoch": 0.4928788606176988, + "grad_norm": 0.2128416746854782, + "kl": 0.7321541458368301, + "learning_rate": 1.1981722252105827e-05, + "loss": 0.0875, + "reward": 0.9989583432674408, + "reward_std": 0.15414710491895675, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9614583432674408, + "step": 1540 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.1687774658203, + "epoch": 0.49319891182589215, + "grad_norm": 0.31140488386154175, + "kl": 0.5592325419187546, + "learning_rate": 1.1970766601524733e-05, + "loss": 0.1257, + "reward": 1.0786458671092987, + "reward_std": 0.20177911669015886, + "rewards/accuracy_reward": 0.11250000279396773, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458492279053, + "step": 1541 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.1750305175781, + "epoch": 0.49351896303408543, + "grad_norm": 0.21769623458385468, + "kl": 0.8614889137446881, + "learning_rate": 1.1959808489367897e-05, + "loss": 0.0884, + "reward": 1.0255208432674408, + "reward_std": 0.15918941050767899, + "rewards/accuracy_reward": 0.07916666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9463541805744171, + "step": 1542 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.5604370117187, + "epoch": 0.4938390142422788, + "grad_norm": 0.15683266520500183, + "kl": 0.3458200544118881, + "learning_rate": 1.1948847929322498e-05, + "loss": 0.0774, + "reward": 1.0020833551883697, + "reward_std": 0.10447518564760686, + "rewards/accuracy_reward": 0.02083333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9812500178813934, + "step": 1543 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.0687683105468, + "epoch": 0.49415906545047206, + "grad_norm": 0.25657808780670166, + "kl": 0.3729611948132515, + "learning_rate": 1.1937884935078767e-05, + "loss": 0.1044, + "reward": 1.0755208551883697, + "reward_std": 0.12975474894046785, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.973437511920929, + "step": 1544 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.5958435058594, + "epoch": 0.4944791166586654, + "grad_norm": 0.28509220480918884, + "kl": 0.39910185933113096, + "learning_rate": 1.192691952032997e-05, + "loss": 0.0795, + "reward": 1.0265625238418579, + "reward_std": 0.1085195817053318, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958373069763, + "step": 1545 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.1687774658203, + "epoch": 0.4947991678668587, + "grad_norm": 0.08284153044223785, + "kl": 0.42232955545186995, + "learning_rate": 1.1915951698772403e-05, + "loss": 0.0637, + "reward": 1.0494791924953462, + "reward_std": 0.125248346850276, + "rewards/accuracy_reward": 0.07083333507180214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458551883698, + "step": 1546 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.6583587646485, + "epoch": 0.495119219075052, + "grad_norm": 0.19617708027362823, + "kl": 0.3461019277572632, + "learning_rate": 1.1904981484105367e-05, + "loss": 0.0888, + "reward": 1.0942708551883698, + "reward_std": 0.12184848748147488, + "rewards/accuracy_reward": 0.1187500050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1547 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.3375061035156, + "epoch": 0.4954392702832453, + "grad_norm": 0.12780652940273285, + "kl": 0.6290895700454712, + "learning_rate": 1.1894008890031152e-05, + "loss": 0.0819, + "reward": 1.0213541865348816, + "reward_std": 0.1498611181974411, + "rewards/accuracy_reward": 0.06041666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375119209289, + "step": 1548 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.0604309082031, + "epoch": 0.49575932149143864, + "grad_norm": 0.2656075656414032, + "kl": 0.5985722355544567, + "learning_rate": 1.1883033930255018e-05, + "loss": 0.113, + "reward": 1.0385416984558105, + "reward_std": 0.13213938027620314, + "rewards/accuracy_reward": 0.06041666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 1549 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.5437622070312, + "epoch": 0.4960793726996319, + "grad_norm": 0.10763701051473618, + "kl": 0.33446870669722556, + "learning_rate": 1.1872056618485183e-05, + "loss": 0.1121, + "reward": 1.0614583551883698, + "reward_std": 0.14568435922265052, + "rewards/accuracy_reward": 0.08958333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9718750059604645, + "step": 1550 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.8854309082031, + "epoch": 0.49639942390782527, + "grad_norm": 0.1511547714471817, + "kl": 0.42227124869823457, + "learning_rate": 1.1861076968432794e-05, + "loss": 0.0709, + "reward": 1.0765625298023225, + "reward_std": 0.12650047056376934, + "rewards/accuracy_reward": 0.10625000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9703125178813934, + "step": 1551 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.7541809082031, + "epoch": 0.49671947511601855, + "grad_norm": 0.10951292514801025, + "kl": 0.31010197959840297, + "learning_rate": 1.1850094993811936e-05, + "loss": 0.0808, + "reward": 1.0333333551883697, + "reward_std": 0.11681788396090269, + "rewards/accuracy_reward": 0.052083333395421504, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9812500178813934, + "step": 1552 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.702099609375, + "epoch": 0.4970395263242119, + "grad_norm": 0.1494865119457245, + "kl": 0.45758294574916364, + "learning_rate": 1.183911070833958e-05, + "loss": 0.0535, + "reward": 1.0880208551883697, + "reward_std": 0.1360863834619522, + "rewards/accuracy_reward": 0.10416666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541805744171, + "step": 1553 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.3000122070313, + "epoch": 0.4973595775324052, + "grad_norm": 0.12859082221984863, + "kl": 0.47827325090765954, + "learning_rate": 1.1828124125735597e-05, + "loss": 0.0762, + "reward": 1.0489583492279053, + "reward_std": 0.11680370531976222, + "rewards/accuracy_reward": 0.06875000111758708, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083492279052, + "step": 1554 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.5396057128906, + "epoch": 0.4976796287405985, + "grad_norm": 0.14917534589767456, + "kl": 0.4069339819252491, + "learning_rate": 1.1817135259722707e-05, + "loss": 0.0827, + "reward": 1.094791704416275, + "reward_std": 0.13878009673207997, + "rewards/accuracy_reward": 0.11666667181998491, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 1555 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.6979370117188, + "epoch": 0.4979996799487918, + "grad_norm": 0.20614580810070038, + "kl": 0.3559134520590305, + "learning_rate": 1.1806144124026514e-05, + "loss": 0.0353, + "reward": 1.083854180574417, + "reward_std": 0.1292146436870098, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541686534882, + "step": 1556 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.9000183105469, + "epoch": 0.49831973115698514, + "grad_norm": 0.054509177803993225, + "kl": 0.16616240218281747, + "learning_rate": 1.1795150732375425e-05, + "loss": 0.0222, + "reward": 1.007812511920929, + "reward_std": 0.05190774351358414, + "rewards/accuracy_reward": 0.01458333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9932291686534882, + "step": 1557 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.9208435058594, + "epoch": 0.4986397823651784, + "grad_norm": 0.09618407487869263, + "kl": 0.27430230379104614, + "learning_rate": 1.1784155098500682e-05, + "loss": 0.061, + "reward": 1.0598958671092986, + "reward_std": 0.0757111854851246, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958551883698, + "step": 1558 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.4854339599609, + "epoch": 0.49895983357337176, + "grad_norm": 0.2027837336063385, + "kl": 0.5071586236357689, + "learning_rate": 1.1773157236136328e-05, + "loss": 0.0846, + "reward": 1.0614583551883698, + "reward_std": 0.1345365099608898, + "rewards/accuracy_reward": 0.08541666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1559 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.9750244140625, + "epoch": 0.49927988478156504, + "grad_norm": 0.07568039745092392, + "kl": 0.18464777991175652, + "learning_rate": 1.1762157159019184e-05, + "loss": 0.042, + "reward": 0.9994791865348815, + "reward_std": 0.0876783674582839, + "rewards/accuracy_reward": 0.01458333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958373069763, + "step": 1560 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.9687622070312, + "epoch": 0.4995999359897584, + "grad_norm": 0.09287263453006744, + "kl": 0.21261435151100158, + "learning_rate": 1.1751154880888835e-05, + "loss": 0.034, + "reward": 1.0916666865348816, + "reward_std": 0.10271332561969757, + "rewards/accuracy_reward": 0.10208333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9895833373069763, + "step": 1561 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.5041931152343, + "epoch": 0.49991998719795167, + "grad_norm": 0.06782330572605133, + "kl": 0.26801488250494004, + "learning_rate": 1.1740150415487621e-05, + "loss": 0.0638, + "reward": 1.053125023841858, + "reward_std": 0.1198198726400733, + "rewards/accuracy_reward": 0.07083333488553763, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916746139526, + "step": 1562 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.9812744140625, + "epoch": 0.500240038406145, + "grad_norm": 0.24701856076717377, + "kl": 0.2620903179049492, + "learning_rate": 1.1729143776560614e-05, + "loss": 0.1063, + "reward": 1.0911458671092986, + "reward_std": 0.1619997039437294, + "rewards/accuracy_reward": 0.1125000050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1563 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.0208526611328, + "epoch": 0.5005600896143383, + "grad_norm": 0.12949657440185547, + "kl": 0.30676123052835463, + "learning_rate": 1.17181349778556e-05, + "loss": 0.0417, + "reward": 1.0942708492279052, + "reward_std": 0.16428543999791145, + "rewards/accuracy_reward": 0.11666666734963656, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 1564 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.2500152587891, + "epoch": 0.5008801408225316, + "grad_norm": 0.1947976052761078, + "kl": 0.3174207493662834, + "learning_rate": 1.1707124033123058e-05, + "loss": 0.0828, + "reward": 1.0536458611488342, + "reward_std": 0.1148330207914114, + "rewards/accuracy_reward": 0.07500000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1565 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.660433959961, + "epoch": 0.501200192030725, + "grad_norm": 0.13384360074996948, + "kl": 0.4696915991604328, + "learning_rate": 1.1696110956116151e-05, + "loss": 0.1037, + "reward": 1.0854166984558105, + "reward_std": 0.12102588117122651, + "rewards/accuracy_reward": 0.10625000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666746139527, + "step": 1566 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.2395965576172, + "epoch": 0.5015202432389182, + "grad_norm": 0.38326171040534973, + "kl": 0.25607917830348015, + "learning_rate": 1.1685095760590706e-05, + "loss": 0.0815, + "reward": 1.0500000178813935, + "reward_std": 0.11502253264188766, + "rewards/accuracy_reward": 0.07291666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833492279053, + "step": 1567 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.458349609375, + "epoch": 0.5018402944471115, + "grad_norm": 0.15577267110347748, + "kl": 0.39244875088334086, + "learning_rate": 1.1674078460305199e-05, + "loss": 0.0476, + "reward": 1.035416692495346, + "reward_std": 0.12932371124625205, + "rewards/accuracy_reward": 0.05208333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333432674408, + "step": 1568 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.4021026611329, + "epoch": 0.5021603456553049, + "grad_norm": 0.20703621208667755, + "kl": 0.300426347181201, + "learning_rate": 1.1663059069020728e-05, + "loss": 0.0965, + "reward": 1.112500011920929, + "reward_std": 0.15419322103261948, + "rewards/accuracy_reward": 0.13541666995733975, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833373069763, + "step": 1569 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.2916900634766, + "epoch": 0.5024803968634982, + "grad_norm": 0.13127927482128143, + "kl": 0.3124852038919926, + "learning_rate": 1.1652037600501007e-05, + "loss": 0.0743, + "reward": 1.0692708611488342, + "reward_std": 0.1161904064938426, + "rewards/accuracy_reward": 0.08958333879709243, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.979687511920929, + "step": 1570 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.0646026611328, + "epoch": 0.5028004480716914, + "grad_norm": 0.24042336642742157, + "kl": 0.44360672757029534, + "learning_rate": 1.1641014068512342e-05, + "loss": 0.071, + "reward": 1.0895833849906922, + "reward_std": 0.14635562822222709, + "rewards/accuracy_reward": 0.11458333879709244, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000238418579, + "step": 1571 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.0375244140625, + "epoch": 0.5031204992798848, + "grad_norm": 0.17726963758468628, + "kl": 0.32658193036913874, + "learning_rate": 1.162998848682362e-05, + "loss": 0.0587, + "reward": 1.1692708730697632, + "reward_std": 0.12005004528909921, + "rewards/accuracy_reward": 0.1812500072643161, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9880208492279052, + "step": 1572 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.9583557128906, + "epoch": 0.5034405504880781, + "grad_norm": 0.38185715675354004, + "kl": 0.6799829356372357, + "learning_rate": 1.1618960869206287e-05, + "loss": 0.118, + "reward": 1.0156250178813935, + "reward_std": 0.14403234291821718, + "rewards/accuracy_reward": 0.0354166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083492279052, + "step": 1573 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.5062744140625, + "epoch": 0.5037606016962713, + "grad_norm": 0.19469714164733887, + "kl": 0.5438534311950207, + "learning_rate": 1.1607931229434328e-05, + "loss": 0.0845, + "reward": 1.0177083671092988, + "reward_std": 0.09994984827935696, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416865348815, + "step": 1574 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.4979309082031, + "epoch": 0.5040806529044647, + "grad_norm": 0.16624751687049866, + "kl": 0.49131586998701093, + "learning_rate": 1.1596899581284263e-05, + "loss": 0.0904, + "reward": 1.0052083671092986, + "reward_std": 0.1507533010095358, + "rewards/accuracy_reward": 0.033333334140479565, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.971875011920929, + "step": 1575 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.9333465576171, + "epoch": 0.504400704112658, + "grad_norm": 0.12965738773345947, + "kl": 0.3315769825130701, + "learning_rate": 1.1585865938535106e-05, + "loss": 0.0851, + "reward": 1.079687523841858, + "reward_std": 0.10819105207920074, + "rewards/accuracy_reward": 0.09791666995733976, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708373069763, + "step": 1576 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.7479309082031, + "epoch": 0.5047207553208514, + "grad_norm": 0.10354313999414444, + "kl": 0.501846868172288, + "learning_rate": 1.157483031496838e-05, + "loss": 0.0522, + "reward": 1.1197916865348816, + "reward_std": 0.11418646480888128, + "rewards/accuracy_reward": 0.13750000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916805744171, + "step": 1577 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.2604370117188, + "epoch": 0.5050408065290446, + "grad_norm": 0.10002946853637695, + "kl": 0.256185794621706, + "learning_rate": 1.1563792724368066e-05, + "loss": 0.0363, + "reward": 1.0333333551883697, + "reward_std": 0.08332265578210354, + "rewards/accuracy_reward": 0.04583333563059568, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.987500011920929, + "step": 1578 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.5146057128907, + "epoch": 0.5053608577372379, + "grad_norm": 0.08440849930047989, + "kl": 0.23348399624228477, + "learning_rate": 1.1552753180520612e-05, + "loss": 0.0526, + "reward": 1.0494791924953462, + "reward_std": 0.06868757046759129, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291865348816, + "step": 1579 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.3291870117188, + "epoch": 0.5056809089454313, + "grad_norm": 0.07465245574712753, + "kl": 0.2693470485508442, + "learning_rate": 1.15417116972149e-05, + "loss": 0.057, + "reward": 1.0104166924953462, + "reward_std": 0.10453488267958164, + "rewards/accuracy_reward": 0.025000001303851604, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166805744171, + "step": 1580 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.9937622070313, + "epoch": 0.5060009601536246, + "grad_norm": 0.12690547108650208, + "kl": 0.36593677997589114, + "learning_rate": 1.1530668288242244e-05, + "loss": 0.1098, + "reward": 1.122916692495346, + "reward_std": 0.15517406426370145, + "rewards/accuracy_reward": 0.1583333384245634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833492279052, + "step": 1581 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.5250183105469, + "epoch": 0.5063210113618178, + "grad_norm": 0.08542995899915695, + "kl": 0.30837327912449836, + "learning_rate": 1.1519622967396347e-05, + "loss": 0.0626, + "reward": 1.0864583671092987, + "reward_std": 0.1340044129639864, + "rewards/accuracy_reward": 0.11041667181998491, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1582 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.783349609375, + "epoch": 0.5066410625700112, + "grad_norm": 0.12930899858474731, + "kl": 0.20259029194712638, + "learning_rate": 1.1508575748473317e-05, + "loss": 0.07, + "reward": 1.027604192495346, + "reward_std": 0.10508870705962181, + "rewards/accuracy_reward": 0.0479166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9796875178813934, + "step": 1583 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.8916870117188, + "epoch": 0.5069611137782045, + "grad_norm": 0.12330963462591171, + "kl": 0.2539986282587051, + "learning_rate": 1.1497526645271618e-05, + "loss": 0.0491, + "reward": 1.0385416924953461, + "reward_std": 0.09695471059530973, + "rewards/accuracy_reward": 0.0541666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750178813935, + "step": 1584 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.6916809082031, + "epoch": 0.5072811649863979, + "grad_norm": 0.09102137386798859, + "kl": 0.23063104897737502, + "learning_rate": 1.1486475671592084e-05, + "loss": 0.0721, + "reward": 1.0562500298023223, + "reward_std": 0.12390441037714481, + "rewards/accuracy_reward": 0.07916666846722364, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833551883698, + "step": 1585 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.4791870117188, + "epoch": 0.5076012161945911, + "grad_norm": 0.10432296246290207, + "kl": 0.30286980494856836, + "learning_rate": 1.1475422841237867e-05, + "loss": 0.0615, + "reward": 1.0531250298023225, + "reward_std": 0.10416628401726484, + "rewards/accuracy_reward": 0.07291666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083551883698, + "step": 1586 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.1687744140625, + "epoch": 0.5079212674027844, + "grad_norm": 0.07721851021051407, + "kl": 0.2693323642015457, + "learning_rate": 1.146436816801445e-05, + "loss": 0.0738, + "reward": 1.0328125298023223, + "reward_std": 0.13880394026637077, + "rewards/accuracy_reward": 0.05208333563059568, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1587 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.4062683105469, + "epoch": 0.5082413186109778, + "grad_norm": 0.1546679586172104, + "kl": 0.18442733883857726, + "learning_rate": 1.1453311665729618e-05, + "loss": 0.0621, + "reward": 1.068229180574417, + "reward_std": 0.0874556915834546, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.982812511920929, + "step": 1588 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.8958557128906, + "epoch": 0.5085613698191711, + "grad_norm": 0.11382945626974106, + "kl": 0.1633252240717411, + "learning_rate": 1.1442253348193437e-05, + "loss": 0.063, + "reward": 1.1130208551883698, + "reward_std": 0.10782817732542753, + "rewards/accuracy_reward": 0.1270833358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 1589 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.0937652587891, + "epoch": 0.5088814210273643, + "grad_norm": 0.10278374701738358, + "kl": 0.18282031267881393, + "learning_rate": 1.1431193229218236e-05, + "loss": 0.0389, + "reward": 1.1364583492279052, + "reward_std": 0.10143474787473679, + "rewards/accuracy_reward": 0.1541666707023978, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916746139526, + "step": 1590 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.0812683105469, + "epoch": 0.5092014722355577, + "grad_norm": 0.13747954368591309, + "kl": 0.21457262933254242, + "learning_rate": 1.1420131322618601e-05, + "loss": 0.0535, + "reward": 1.0312500298023224, + "reward_std": 0.09525865130126476, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9812500238418579, + "step": 1591 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.743765258789, + "epoch": 0.509521523443751, + "grad_norm": 0.06254348903894424, + "kl": 0.1685093741863966, + "learning_rate": 1.1409067642211352e-05, + "loss": 0.0469, + "reward": 1.0687500178813933, + "reward_std": 0.10046824738383293, + "rewards/accuracy_reward": 0.08333333544433116, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166746139527, + "step": 1592 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.8271057128907, + "epoch": 0.5098415746519444, + "grad_norm": 0.10820659250020981, + "kl": 0.20122895240783692, + "learning_rate": 1.1398002201815517e-05, + "loss": 0.0486, + "reward": 1.007812511920929, + "reward_std": 0.0893495699390769, + "rewards/accuracy_reward": 0.02291666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958432674408, + "step": 1593 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.4104309082031, + "epoch": 0.5101616258601376, + "grad_norm": 0.1397324651479721, + "kl": 0.22246124893426894, + "learning_rate": 1.138693501525233e-05, + "loss": 0.0574, + "reward": 1.1260416865348817, + "reward_std": 0.14539001807570456, + "rewards/accuracy_reward": 0.14166667126119137, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1594 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.7250183105468, + "epoch": 0.5104816770683309, + "grad_norm": 0.08888793736696243, + "kl": 0.25225738137960435, + "learning_rate": 1.1375866096345201e-05, + "loss": 0.0785, + "reward": 1.0520833671092986, + "reward_std": 0.11396115198731423, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1595 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.0479309082032, + "epoch": 0.5108017282765243, + "grad_norm": 0.10985810309648514, + "kl": 0.17879950627684593, + "learning_rate": 1.1364795458919704e-05, + "loss": 0.0433, + "reward": 1.083854180574417, + "reward_std": 0.0588629137724638, + "rewards/accuracy_reward": 0.09375000502914191, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9901041746139526, + "step": 1596 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.3020935058594, + "epoch": 0.5111217794847176, + "grad_norm": 0.09722897410392761, + "kl": 0.23140017688274384, + "learning_rate": 1.135372311680356e-05, + "loss": 0.0473, + "reward": 1.1265625298023223, + "reward_std": 0.13466337826102973, + "rewards/accuracy_reward": 0.1395833369344473, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9869791805744171, + "step": 1597 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.4125122070312, + "epoch": 0.5114418306929108, + "grad_norm": 0.06069159880280495, + "kl": 0.19542404487729073, + "learning_rate": 1.1342649083826629e-05, + "loss": 0.0555, + "reward": 1.0776041805744172, + "reward_std": 0.08706863857805729, + "rewards/accuracy_reward": 0.09583333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708432674408, + "step": 1598 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.1021026611328, + "epoch": 0.5117618819011042, + "grad_norm": 0.04462220519781113, + "kl": 0.16532036811113357, + "learning_rate": 1.1331573373820864e-05, + "loss": 0.07, + "reward": 1.0604166865348816, + "reward_std": 0.08268660437315703, + "rewards/accuracy_reward": 0.07708333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333432674408, + "step": 1599 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.520849609375, + "epoch": 0.5120819331092975, + "grad_norm": 0.16866251826286316, + "kl": 0.19539010524749756, + "learning_rate": 1.1320496000620325e-05, + "loss": 0.0499, + "reward": 1.0390625417232513, + "reward_std": 0.11077856961637736, + "rewards/accuracy_reward": 0.05416666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958432674408, + "step": 1600 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.6520965576171, + "epoch": 0.5124019843174908, + "grad_norm": 0.07243788987398148, + "kl": 0.16208918057382107, + "learning_rate": 1.1309416978061149e-05, + "loss": 0.0587, + "reward": 1.0708333492279052, + "reward_std": 0.10626544300466775, + "rewards/accuracy_reward": 0.08541666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166805744171, + "step": 1601 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.2062683105469, + "epoch": 0.5127220355256841, + "grad_norm": 0.060984719544649124, + "kl": 0.22590798139572144, + "learning_rate": 1.1298336319981532e-05, + "loss": 0.0347, + "reward": 1.0562500178813934, + "reward_std": 0.06318792756646871, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166865348816, + "step": 1602 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.9771057128906, + "epoch": 0.5130420867338774, + "grad_norm": 0.05511503294110298, + "kl": 0.13532592430710794, + "learning_rate": 1.128725404022171e-05, + "loss": 0.027, + "reward": 1.0979166984558106, + "reward_std": 0.14514823630452156, + "rewards/accuracy_reward": 0.10833333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1603 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.5083526611328, + "epoch": 0.5133621379420708, + "grad_norm": 0.06461673974990845, + "kl": 0.1610957682132721, + "learning_rate": 1.1276170152623948e-05, + "loss": 0.045, + "reward": 1.0687500357627868, + "reward_std": 0.07482259795069694, + "rewards/accuracy_reward": 0.07916666902601718, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1604 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.4104370117187, + "epoch": 0.5136821891502641, + "grad_norm": 0.05804125964641571, + "kl": 0.14314365684986113, + "learning_rate": 1.1265084671032516e-05, + "loss": 0.0558, + "reward": 1.1213541984558106, + "reward_std": 0.11102346312254667, + "rewards/accuracy_reward": 0.13333333786576987, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9880208492279052, + "step": 1605 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.770849609375, + "epoch": 0.5140022403584573, + "grad_norm": 0.045790914446115494, + "kl": 0.17196787595748902, + "learning_rate": 1.1253997609293684e-05, + "loss": 0.0459, + "reward": 1.0552083671092987, + "reward_std": 0.16030770651996135, + "rewards/accuracy_reward": 0.07291666995733977, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916865348816, + "step": 1606 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.3562683105469, + "epoch": 0.5143222915666507, + "grad_norm": 0.03354915976524353, + "kl": 0.10071540996432304, + "learning_rate": 1.1242908981255676e-05, + "loss": 0.0336, + "reward": 1.0520833611488343, + "reward_std": 0.06626205574721097, + "rewards/accuracy_reward": 0.06250000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9895833492279053, + "step": 1607 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.9458526611328, + "epoch": 0.514642342774844, + "grad_norm": 0.06027187034487724, + "kl": 0.1353273831307888, + "learning_rate": 1.1231818800768696e-05, + "loss": 0.0616, + "reward": 1.064062511920929, + "reward_std": 0.08453094661235809, + "rewards/accuracy_reward": 0.08125000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.982812511920929, + "step": 1608 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.9979309082031, + "epoch": 0.5149623939830373, + "grad_norm": 0.08312620967626572, + "kl": 0.10024843215942383, + "learning_rate": 1.122072708168487e-05, + "loss": 0.0348, + "reward": 1.0635416865348817, + "reward_std": 0.05407893825322389, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9906250178813935, + "step": 1609 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.764599609375, + "epoch": 0.5152824451912306, + "grad_norm": 0.13844886422157288, + "kl": 0.30158936940133574, + "learning_rate": 1.1209633837858256e-05, + "loss": 0.0307, + "reward": 1.043750023841858, + "reward_std": 0.08770579267293215, + "rewards/accuracy_reward": 0.06458333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1610 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.081265258789, + "epoch": 0.5156024963994239, + "grad_norm": 0.08852372318506241, + "kl": 0.14461980909109115, + "learning_rate": 1.1198539083144808e-05, + "loss": 0.0344, + "reward": 1.0302083671092988, + "reward_std": 0.11091033667325974, + "rewards/accuracy_reward": 0.052083334885537624, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250178813934, + "step": 1611 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.8437744140625, + "epoch": 0.5159225476076172, + "grad_norm": 0.05620553344488144, + "kl": 0.1273003876209259, + "learning_rate": 1.1187442831402378e-05, + "loss": 0.0337, + "reward": 1.028125035762787, + "reward_std": 0.11089132819324732, + "rewards/accuracy_reward": 0.043750000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750178813935, + "step": 1612 + }, + { + "clip_ratio": 0.0, + "completion_length": 624.5875244140625, + "epoch": 0.5162425988158106, + "grad_norm": 0.07525043934583664, + "kl": 0.13220786526799203, + "learning_rate": 1.1176345096490671e-05, + "loss": 0.0326, + "reward": 1.1453125298023223, + "reward_std": 0.12826823052018882, + "rewards/accuracy_reward": 0.16666667368263005, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1613 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.2125244140625, + "epoch": 0.5165626500240038, + "grad_norm": 0.06219992786645889, + "kl": 0.2638593137264252, + "learning_rate": 1.1165245892271265e-05, + "loss": 0.0666, + "reward": 1.1098958551883698, + "reward_std": 0.11750640124082565, + "rewards/accuracy_reward": 0.13125000409781934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1614 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.9750122070312, + "epoch": 0.5168827012321972, + "grad_norm": 0.1037401407957077, + "kl": 0.49187539964914323, + "learning_rate": 1.1154145232607558e-05, + "loss": 0.0055, + "reward": 1.1229166865348816, + "reward_std": 0.10175382327288389, + "rewards/accuracy_reward": 0.13958333674818277, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333432674408, + "step": 1615 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.4583526611328, + "epoch": 0.5172027524403905, + "grad_norm": 0.09893354773521423, + "kl": 0.33075669668614865, + "learning_rate": 1.114304313136477e-05, + "loss": 0.03, + "reward": 1.0557292103767395, + "reward_std": 0.10697339177131653, + "rewards/accuracy_reward": 0.07291666995733977, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9828125178813935, + "step": 1616 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.6666839599609, + "epoch": 0.5175228036485837, + "grad_norm": 0.10171794891357422, + "kl": 0.172719369456172, + "learning_rate": 1.1131939602409926e-05, + "loss": 0.0272, + "reward": 1.1250000298023224, + "reward_std": 0.11128932349383831, + "rewards/accuracy_reward": 0.13958333544433116, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166865348816, + "step": 1617 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.4916900634765, + "epoch": 0.5178428548567771, + "grad_norm": 0.08092484623193741, + "kl": 0.38851368948817255, + "learning_rate": 1.1120834659611832e-05, + "loss": 0.0293, + "reward": 1.115104192495346, + "reward_std": 0.079699626006186, + "rewards/accuracy_reward": 0.13125000409781934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541805744171, + "step": 1618 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.6375122070312, + "epoch": 0.5181629060649704, + "grad_norm": 0.13454897701740265, + "kl": 0.2053068086504936, + "learning_rate": 1.1109728316841056e-05, + "loss": 0.0206, + "reward": 1.025000011920929, + "reward_std": 0.08904234617948532, + "rewards/accuracy_reward": 0.0375000013038516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.987500011920929, + "step": 1619 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.0979400634766, + "epoch": 0.5184829572731637, + "grad_norm": 0.040230825543403625, + "kl": 0.10919744968414306, + "learning_rate": 1.1098620587969915e-05, + "loss": 0.0143, + "reward": 1.082812523841858, + "reward_std": 0.05984876081347466, + "rewards/accuracy_reward": 0.08750000260770321, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9932291805744171, + "step": 1620 + }, + { + "clip_ratio": 0.0, + "completion_length": 629.633349609375, + "epoch": 0.518803008481357, + "grad_norm": 0.05787614732980728, + "kl": 0.2819319121539593, + "learning_rate": 1.1087511486872461e-05, + "loss": 0.0489, + "reward": 1.0286458611488343, + "reward_std": 0.12565587386488913, + "rewards/accuracy_reward": 0.04791666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291746139526, + "step": 1621 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.9333618164062, + "epoch": 0.5191230596895503, + "grad_norm": 0.09872201085090637, + "kl": 0.14482049122452736, + "learning_rate": 1.1076401027424464e-05, + "loss": 0.0377, + "reward": 1.032291704416275, + "reward_std": 0.11295400913804769, + "rewards/accuracy_reward": 0.04375000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9885416805744172, + "step": 1622 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.5416870117188, + "epoch": 0.5194431108977436, + "grad_norm": 0.05958201363682747, + "kl": 0.17514662258327007, + "learning_rate": 1.106528922350338e-05, + "loss": 0.0322, + "reward": 1.0880208551883697, + "reward_std": 0.061942750960588454, + "rewards/accuracy_reward": 0.10000000596046447, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9880208432674408, + "step": 1623 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.2687713623047, + "epoch": 0.519763162105937, + "grad_norm": 0.16206012666225433, + "kl": 0.14412535950541497, + "learning_rate": 1.1054176088988352e-05, + "loss": 0.0338, + "reward": 1.0739583551883698, + "reward_std": 0.08700152412056923, + "rewards/accuracy_reward": 0.08958333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1624 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.5583526611329, + "epoch": 0.5200832133141302, + "grad_norm": 0.06881947815418243, + "kl": 0.1701977513730526, + "learning_rate": 1.1043061637760184e-05, + "loss": 0.0415, + "reward": 1.090104192495346, + "reward_std": 0.13068218622356653, + "rewards/accuracy_reward": 0.10833333767950534, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708551883697, + "step": 1625 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.1791870117188, + "epoch": 0.5204032645223235, + "grad_norm": 0.09448045492172241, + "kl": 0.2662590444087982, + "learning_rate": 1.1031945883701319e-05, + "loss": 0.0261, + "reward": 1.0036458551883698, + "reward_std": 0.11030229702591896, + "rewards/accuracy_reward": 0.01875000037252903, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.982812511920929, + "step": 1626 + }, + { + "clip_ratio": 0.0, + "completion_length": 642.1333557128906, + "epoch": 0.5207233157305169, + "grad_norm": 0.2936466634273529, + "kl": 0.3425960190594196, + "learning_rate": 1.1020828840695836e-05, + "loss": 0.059, + "reward": 1.071354192495346, + "reward_std": 0.11369431018829346, + "rewards/accuracy_reward": 0.09375000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 1627 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.9271057128906, + "epoch": 0.5210433669387102, + "grad_norm": 0.12267967313528061, + "kl": 0.16440969109535217, + "learning_rate": 1.1009710522629415e-05, + "loss": 0.0323, + "reward": 1.0927083492279053, + "reward_std": 0.11118660233914852, + "rewards/accuracy_reward": 0.10416666846722364, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9864583432674408, + "step": 1628 + }, + { + "clip_ratio": 0.0, + "completion_length": 638.9687744140625, + "epoch": 0.5213634181469035, + "grad_norm": 0.07434359192848206, + "kl": 0.13105014562606812, + "learning_rate": 1.099859094338934e-05, + "loss": 0.0258, + "reward": 1.0406250417232514, + "reward_std": 0.11527959946542979, + "rewards/accuracy_reward": 0.05000000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9906250059604644, + "step": 1629 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.7312683105469, + "epoch": 0.5216834693550968, + "grad_norm": 0.06500350683927536, + "kl": 0.12343095205724239, + "learning_rate": 1.0987470116864454e-05, + "loss": 0.0325, + "reward": 1.1812500298023223, + "reward_std": 0.0721820143982768, + "rewards/accuracy_reward": 0.18958333935588598, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9916666746139526, + "step": 1630 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.7062683105469, + "epoch": 0.5220035205632901, + "grad_norm": 0.057139065116643906, + "kl": 0.28337795436382296, + "learning_rate": 1.0976348056945176e-05, + "loss": 0.0548, + "reward": 1.0281250298023223, + "reward_std": 0.11384273990988732, + "rewards/accuracy_reward": 0.05208333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416865348815, + "step": 1631 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.2166870117187, + "epoch": 0.5223235717714835, + "grad_norm": 0.05733027681708336, + "kl": 0.19319367222487926, + "learning_rate": 1.096522477752345e-05, + "loss": 0.0497, + "reward": 1.0739583492279052, + "reward_std": 0.1705170389264822, + "rewards/accuracy_reward": 0.08958333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750178813935, + "step": 1632 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.8000244140625, + "epoch": 0.5226436229796767, + "grad_norm": 0.06594210118055344, + "kl": 0.1955754801630974, + "learning_rate": 1.0954100292492758e-05, + "loss": 0.0696, + "reward": 1.0895833611488341, + "reward_std": 0.16243830658495426, + "rewards/accuracy_reward": 0.10833333749324084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9812500178813934, + "step": 1633 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.0229431152344, + "epoch": 0.52296367418787, + "grad_norm": 0.10943306237459183, + "kl": 0.1569736622273922, + "learning_rate": 1.0942974615748069e-05, + "loss": 0.0473, + "reward": 1.0062500178813933, + "reward_std": 0.08850672990083694, + "rewards/accuracy_reward": 0.020833334513008596, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166805744171, + "step": 1634 + }, + { + "clip_ratio": 0.0, + "completion_length": 628.8208618164062, + "epoch": 0.5232837253960634, + "grad_norm": 0.06816142052412033, + "kl": 0.20981598347425462, + "learning_rate": 1.0931847761185863e-05, + "loss": 0.0467, + "reward": 1.0416666924953462, + "reward_std": 0.13459960371255875, + "rewards/accuracy_reward": 0.06041666902601719, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1635 + }, + { + "clip_ratio": 0.0, + "completion_length": 653.4812744140625, + "epoch": 0.5236037766042567, + "grad_norm": 0.08721835911273956, + "kl": 0.2609092280268669, + "learning_rate": 1.0920719742704071e-05, + "loss": 0.027, + "reward": 1.1067708551883697, + "reward_std": 0.1595559787005186, + "rewards/accuracy_reward": 0.13125000409781934, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208551883697, + "step": 1636 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.3937683105469, + "epoch": 0.52392382781245, + "grad_norm": 0.09751680493354797, + "kl": 0.16274499967694284, + "learning_rate": 1.0909590574202094e-05, + "loss": 0.0467, + "reward": 1.0932291805744172, + "reward_std": 0.15058468338102102, + "rewards/accuracy_reward": 0.10625000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9869791805744171, + "step": 1637 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.6104309082032, + "epoch": 0.5242438790206433, + "grad_norm": 0.05089619755744934, + "kl": 0.1571653764694929, + "learning_rate": 1.0898460269580753e-05, + "loss": 0.025, + "reward": 1.060937523841858, + "reward_std": 0.11460729204118252, + "rewards/accuracy_reward": 0.07083333488553763, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9901041746139526, + "step": 1638 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.3625213623047, + "epoch": 0.5245639302288366, + "grad_norm": 0.05131271854043007, + "kl": 0.19617617279291152, + "learning_rate": 1.0887328842742307e-05, + "loss": 0.061, + "reward": 1.0338541924953462, + "reward_std": 0.10966868083924056, + "rewards/accuracy_reward": 0.054166667722165586, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9796875178813934, + "step": 1639 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.6271057128906, + "epoch": 0.52488398143703, + "grad_norm": 0.11150997877120972, + "kl": 0.29234243743121624, + "learning_rate": 1.0876196307590396e-05, + "loss": 0.0396, + "reward": 1.129687523841858, + "reward_std": 0.16783834397792816, + "rewards/accuracy_reward": 0.1479166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708432674408, + "step": 1640 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.3250183105469, + "epoch": 0.5252040326452232, + "grad_norm": 0.07527544349431992, + "kl": 0.12925414890050888, + "learning_rate": 1.0865062678030065e-05, + "loss": 0.0348, + "reward": 1.1135416984558106, + "reward_std": 0.12643732279539108, + "rewards/accuracy_reward": 0.12500000316649676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9885416865348816, + "step": 1641 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.1187683105469, + "epoch": 0.5255240838534165, + "grad_norm": 0.11255325376987457, + "kl": 0.1748013935983181, + "learning_rate": 1.0853927967967705e-05, + "loss": 0.0423, + "reward": 1.0729166984558105, + "reward_std": 0.1471638258546591, + "rewards/accuracy_reward": 0.09166666958481073, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1642 + }, + { + "clip_ratio": 0.0, + "completion_length": 635.6479370117188, + "epoch": 0.5258441350616099, + "grad_norm": 0.13885398209095, + "kl": 0.19732209667563438, + "learning_rate": 1.0842792191311079e-05, + "loss": 0.028, + "reward": 1.0838541984558105, + "reward_std": 0.10696388110518455, + "rewards/accuracy_reward": 0.09583333767950535, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9880208432674408, + "step": 1643 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.5771057128907, + "epoch": 0.5261641862698032, + "grad_norm": 0.17584773898124695, + "kl": 0.21706494837999343, + "learning_rate": 1.0831655361969263e-05, + "loss": 0.0298, + "reward": 1.0614583492279053, + "reward_std": 0.05746750514954328, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583432674408, + "step": 1644 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.0312622070312, + "epoch": 0.5264842374779964, + "grad_norm": 0.09475237876176834, + "kl": 0.2863244879990816, + "learning_rate": 1.0820517493852655e-05, + "loss": 0.0454, + "reward": 1.1447917103767395, + "reward_std": 0.10612938590347767, + "rewards/accuracy_reward": 0.16041667126119136, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750178813935, + "step": 1645 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.1062744140625, + "epoch": 0.5268042886861898, + "grad_norm": 0.07149934023618698, + "kl": 0.09891332313418388, + "learning_rate": 1.0809378600872957e-05, + "loss": 0.0438, + "reward": 1.0531250357627868, + "reward_std": 0.06739842891693115, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750298023224, + "step": 1646 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.9021057128906, + "epoch": 0.5271243398943831, + "grad_norm": 0.046160925179719925, + "kl": 0.23046648167073727, + "learning_rate": 1.0798238696943144e-05, + "loss": 0.0443, + "reward": 1.0125000119209289, + "reward_std": 0.10311015080660582, + "rewards/accuracy_reward": 0.03125000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.981250011920929, + "step": 1647 + }, + { + "clip_ratio": 0.0, + "completion_length": 644.7291931152344, + "epoch": 0.5274443911025765, + "grad_norm": 0.1453281044960022, + "kl": 0.19713319800794124, + "learning_rate": 1.0787097795977447e-05, + "loss": 0.0627, + "reward": 1.0276041746139526, + "reward_std": 0.12270144894719123, + "rewards/accuracy_reward": 0.05625, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541746139527, + "step": 1648 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.435433959961, + "epoch": 0.5277644423107697, + "grad_norm": 0.09991775453090668, + "kl": 0.25720389261841775, + "learning_rate": 1.077595591189136e-05, + "loss": 0.0617, + "reward": 1.0979166984558106, + "reward_std": 0.11305834613740444, + "rewards/accuracy_reward": 0.11666666995733976, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.981250011920929, + "step": 1649 + }, + { + "clip_ratio": 0.0, + "completion_length": 678.664599609375, + "epoch": 0.528084493518963, + "grad_norm": 0.049913953989744186, + "kl": 0.16697454005479812, + "learning_rate": 1.0764813058601591e-05, + "loss": 0.0386, + "reward": 1.0432291984558106, + "reward_std": 0.1091542337089777, + "rewards/accuracy_reward": 0.05833333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958492279053, + "step": 1650 + }, + { + "clip_ratio": 0.0, + "completion_length": 629.8896087646484, + "epoch": 0.5284045447271564, + "grad_norm": 0.07997886091470718, + "kl": 0.245851968228817, + "learning_rate": 1.0753669250026062e-05, + "loss": 0.0377, + "reward": 1.109375035762787, + "reward_std": 0.1784997694194317, + "rewards/accuracy_reward": 0.1312500026077032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250178813934, + "step": 1651 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.6104370117188, + "epoch": 0.5287245959353497, + "grad_norm": 0.07213015109300613, + "kl": 0.15853434652090073, + "learning_rate": 1.0742524500083891e-05, + "loss": 0.0473, + "reward": 1.0869792103767395, + "reward_std": 0.14299752824008466, + "rewards/accuracy_reward": 0.10208333861082793, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958432674408, + "step": 1652 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.9937713623046, + "epoch": 0.5290446471435429, + "grad_norm": 0.1067395806312561, + "kl": 0.22370605766773224, + "learning_rate": 1.0731378822695368e-05, + "loss": 0.0714, + "reward": 1.1234375298023225, + "reward_std": 0.10482490509748459, + "rewards/accuracy_reward": 0.1458333373069763, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9755208492279053, + "step": 1653 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.8729370117187, + "epoch": 0.5293646983517363, + "grad_norm": 0.23465663194656372, + "kl": 0.2824744485318661, + "learning_rate": 1.0720232231781944e-05, + "loss": 0.0604, + "reward": 1.0869791865348817, + "reward_std": 0.09580317065119744, + "rewards/accuracy_reward": 0.11041666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625119209289, + "step": 1654 + }, + { + "clip_ratio": 0.0, + "completion_length": 637.6833557128906, + "epoch": 0.5296847495599296, + "grad_norm": 0.09326420724391937, + "kl": 0.2504761453717947, + "learning_rate": 1.070908474126621e-05, + "loss": 0.055, + "reward": 1.1098958730697632, + "reward_std": 0.13805349618196489, + "rewards/accuracy_reward": 0.1312500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1655 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.3979339599609, + "epoch": 0.530004800768123, + "grad_norm": 0.09102252870798111, + "kl": 0.20697028711438178, + "learning_rate": 1.069793636507188e-05, + "loss": 0.054, + "reward": 1.0390625178813935, + "reward_std": 0.09756367336958646, + "rewards/accuracy_reward": 0.0583333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9807291805744172, + "step": 1656 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.1354370117188, + "epoch": 0.5303248519763162, + "grad_norm": 0.09323302656412125, + "kl": 0.20649093016982079, + "learning_rate": 1.0686787117123776e-05, + "loss": 0.0613, + "reward": 1.0656250059604644, + "reward_std": 0.11614571772515773, + "rewards/accuracy_reward": 0.08750000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 1657 + }, + { + "clip_ratio": 0.0, + "completion_length": 650.2395935058594, + "epoch": 0.5306449031845095, + "grad_norm": 0.15243440866470337, + "kl": 0.23995474874973297, + "learning_rate": 1.067563701134781e-05, + "loss": 0.0722, + "reward": 1.1114583551883697, + "reward_std": 0.14056676384061575, + "rewards/accuracy_reward": 0.14166667088866233, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916746139527, + "step": 1658 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.4375305175781, + "epoch": 0.5309649543927029, + "grad_norm": 0.07021994888782501, + "kl": 0.1741291381418705, + "learning_rate": 1.0664486061670957e-05, + "loss": 0.0343, + "reward": 1.0968750357627868, + "reward_std": 0.1501768246293068, + "rewards/accuracy_reward": 0.11666667386889458, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083492279052, + "step": 1659 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.739599609375, + "epoch": 0.5312850056008962, + "grad_norm": 0.11777395009994507, + "kl": 0.4072952277958393, + "learning_rate": 1.0653334282021261e-05, + "loss": 0.0914, + "reward": 1.1380208611488343, + "reward_std": 0.18841511830687524, + "rewards/accuracy_reward": 0.17291667219251394, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041865348816, + "step": 1660 + }, + { + "clip_ratio": 0.0, + "completion_length": 642.8521057128906, + "epoch": 0.5316050568090894, + "grad_norm": 0.09537654370069504, + "kl": 0.15284304693341255, + "learning_rate": 1.0642181686327788e-05, + "loss": 0.0651, + "reward": 1.052604180574417, + "reward_std": 0.12506925128400326, + "rewards/accuracy_reward": 0.07083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708492279053, + "step": 1661 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.5312683105469, + "epoch": 0.5319251080172828, + "grad_norm": 0.15618105232715607, + "kl": 0.3179732132703066, + "learning_rate": 1.0631028288520634e-05, + "loss": 0.0555, + "reward": 1.0354166805744172, + "reward_std": 0.11091351509094238, + "rewards/accuracy_reward": 0.05625000018626451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1662 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.3104370117187, + "epoch": 0.5322451592254761, + "grad_norm": 0.2050512731075287, + "kl": 0.3762538552284241, + "learning_rate": 1.0619874102530886e-05, + "loss": 0.1004, + "reward": 1.0651041924953462, + "reward_std": 0.18217533379793166, + "rewards/accuracy_reward": 0.10000000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9651041805744172, + "step": 1663 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.4875244140625, + "epoch": 0.5325652104336693, + "grad_norm": 0.10614674538373947, + "kl": 0.1920756734907627, + "learning_rate": 1.0608719142290626e-05, + "loss": 0.052, + "reward": 1.0640625298023223, + "reward_std": 0.12579189017415046, + "rewards/accuracy_reward": 0.08125000223517417, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9807291865348816, + "step": 1664 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.9229370117188, + "epoch": 0.5328852616418627, + "grad_norm": 0.11540885269641876, + "kl": 0.31467584148049355, + "learning_rate": 1.0597563421732899e-05, + "loss": 0.0566, + "reward": 1.026041680574417, + "reward_std": 0.12655243016779422, + "rewards/accuracy_reward": 0.05625000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916865348816, + "step": 1665 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.0500183105469, + "epoch": 0.533205312850056, + "grad_norm": 0.24006302654743195, + "kl": 0.22442513927817345, + "learning_rate": 1.0586406954791702e-05, + "loss": 0.0654, + "reward": 1.152604204416275, + "reward_std": 0.13982443250715731, + "rewards/accuracy_reward": 0.17708333861082792, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208551883697, + "step": 1666 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.9833587646484, + "epoch": 0.5335253640582494, + "grad_norm": 0.24842822551727295, + "kl": 0.45479761958122256, + "learning_rate": 1.0575249755401952e-05, + "loss": 0.0873, + "reward": 1.1083333611488342, + "reward_std": 0.16125447899103165, + "rewards/accuracy_reward": 0.1354166718199849, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9708333551883698, + "step": 1667 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.7250183105468, + "epoch": 0.5338454152664426, + "grad_norm": 0.14025604724884033, + "kl": 0.47216527312994006, + "learning_rate": 1.0564091837499503e-05, + "loss": 0.0835, + "reward": 1.0875000178813934, + "reward_std": 0.1817237138748169, + "rewards/accuracy_reward": 0.12291666939854622, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9625000059604645, + "step": 1668 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.3666931152344, + "epoch": 0.5341654664746359, + "grad_norm": 0.2198648303747177, + "kl": 0.239653842151165, + "learning_rate": 1.0552933215021088e-05, + "loss": 0.0429, + "reward": 1.0875000178813934, + "reward_std": 0.11936910003423691, + "rewards/accuracy_reward": 0.1104166716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833432674408, + "step": 1669 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.527099609375, + "epoch": 0.5344855176828293, + "grad_norm": 0.1614704430103302, + "kl": 0.3589115433394909, + "learning_rate": 1.0541773901904327e-05, + "loss": 0.1041, + "reward": 1.105729192495346, + "reward_std": 0.1668396320194006, + "rewards/accuracy_reward": 0.1375000050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291865348815, + "step": 1670 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.5521057128906, + "epoch": 0.5348055688910226, + "grad_norm": 0.30173394083976746, + "kl": 0.4323131963610649, + "learning_rate": 1.0530613912087698e-05, + "loss": 0.0656, + "reward": 1.0489583671092988, + "reward_std": 0.10187356732785702, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416865348815, + "step": 1671 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.1250183105469, + "epoch": 0.5351256200992158, + "grad_norm": 0.349128395318985, + "kl": 0.334938682615757, + "learning_rate": 1.0519453259510535e-05, + "loss": 0.0533, + "reward": 1.0223958611488342, + "reward_std": 0.14337569773197173, + "rewards/accuracy_reward": 0.05000000242143869, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9703125178813934, + "step": 1672 + }, + { + "clip_ratio": 0.0, + "completion_length": 651.4416931152343, + "epoch": 0.5354456713074092, + "grad_norm": 0.20258475840091705, + "kl": 0.4476259782910347, + "learning_rate": 1.0508291958112988e-05, + "loss": 0.0911, + "reward": 1.0401041984558106, + "reward_std": 0.15322203636169435, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.967187511920929, + "step": 1673 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.427099609375, + "epoch": 0.5357657225156025, + "grad_norm": 0.1804238259792328, + "kl": 0.31707819551229477, + "learning_rate": 1.0497130021836023e-05, + "loss": 0.0828, + "reward": 1.1041666984558105, + "reward_std": 0.17293839827179908, + "rewards/accuracy_reward": 0.12708333879709244, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833551883698, + "step": 1674 + }, + { + "clip_ratio": 0.0, + "completion_length": 635.5416931152344, + "epoch": 0.5360857737237958, + "grad_norm": 0.23864604532718658, + "kl": 0.4047357439994812, + "learning_rate": 1.0485967464621401e-05, + "loss": 0.1048, + "reward": 1.068229192495346, + "reward_std": 0.16022577956318856, + "rewards/accuracy_reward": 0.1020833369344473, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 1675 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.5458435058594, + "epoch": 0.5364058249319891, + "grad_norm": 0.20877757668495178, + "kl": 0.3747469946742058, + "learning_rate": 1.0474804300411652e-05, + "loss": 0.0864, + "reward": 1.0338541924953462, + "reward_std": 0.16450630873441696, + "rewards/accuracy_reward": 0.0666666692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875178813935, + "step": 1676 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.9208465576172, + "epoch": 0.5367258761401824, + "grad_norm": 0.4435693323612213, + "kl": 0.4540784254670143, + "learning_rate": 1.046364054315007e-05, + "loss": 0.1087, + "reward": 1.1072917044162751, + "reward_std": 0.18432877622544766, + "rewards/accuracy_reward": 0.14791667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9593750238418579, + "step": 1677 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.6687652587891, + "epoch": 0.5370459273483758, + "grad_norm": 0.17657384276390076, + "kl": 0.4048570953309536, + "learning_rate": 1.0452476206780686e-05, + "loss": 0.0706, + "reward": 1.0468750238418578, + "reward_std": 0.11663634181022645, + "rewards/accuracy_reward": 0.0729166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1678 + }, + { + "clip_ratio": 0.0, + "completion_length": 628.0166870117188, + "epoch": 0.5373659785565691, + "grad_norm": 0.23862002789974213, + "kl": 0.5527682453393936, + "learning_rate": 1.0441311305248258e-05, + "loss": 0.0863, + "reward": 1.051041704416275, + "reward_std": 0.17258851006627082, + "rewards/accuracy_reward": 0.08750000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9635416805744171, + "step": 1679 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.6208587646485, + "epoch": 0.5376860297647623, + "grad_norm": 0.4156144857406616, + "kl": 0.6809851691126824, + "learning_rate": 1.043014585249825e-05, + "loss": 0.0852, + "reward": 0.9619791805744171, + "reward_std": 0.13484069108963012, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9557291805744171, + "step": 1680 + }, + { + "clip_ratio": 0.0, + "completion_length": 624.408349609375, + "epoch": 0.5380060809729557, + "grad_norm": 0.28923049569129944, + "kl": 0.6403138026595115, + "learning_rate": 1.041897986247681e-05, + "loss": 0.1207, + "reward": 1.0343750178813935, + "reward_std": 0.1490859840065241, + "rewards/accuracy_reward": 0.07708333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916865348816, + "step": 1681 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.8729370117187, + "epoch": 0.538326132181149, + "grad_norm": 0.3962026834487915, + "kl": 0.5739488750696182, + "learning_rate": 1.0407813349130758e-05, + "loss": 0.0971, + "reward": 1.0812500357627868, + "reward_std": 0.19865366518497468, + "rewards/accuracy_reward": 0.12500000316649676, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9562500059604645, + "step": 1682 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.3541809082031, + "epoch": 0.5386461833893423, + "grad_norm": 0.3626198172569275, + "kl": 0.4270638257265091, + "learning_rate": 1.039664632640757e-05, + "loss": 0.1173, + "reward": 0.9994791924953461, + "reward_std": 0.1307765144854784, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9598958551883697, + "step": 1683 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.3354370117188, + "epoch": 0.5389662345975356, + "grad_norm": 0.1408691704273224, + "kl": 0.4760843113064766, + "learning_rate": 1.0385478808255358e-05, + "loss": 0.0734, + "reward": 0.9770833492279053, + "reward_std": 0.12489923723042011, + "rewards/accuracy_reward": 0.012500000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833492279052, + "step": 1684 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.3104370117187, + "epoch": 0.5392862858057289, + "grad_norm": 0.2985508143901825, + "kl": 0.3501432552933693, + "learning_rate": 1.0374310808622857e-05, + "loss": 0.0841, + "reward": 0.9781250119209289, + "reward_std": 0.11064638346433639, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916746139527, + "step": 1685 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.0687683105468, + "epoch": 0.5396063370139222, + "grad_norm": 0.40105709433555603, + "kl": 0.47514125406742097, + "learning_rate": 1.0363142341459388e-05, + "loss": 0.1189, + "reward": 1.008854192495346, + "reward_std": 0.1446002159267664, + "rewards/accuracy_reward": 0.04791666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9609375178813935, + "step": 1686 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.7958557128907, + "epoch": 0.5399263882221156, + "grad_norm": 0.17932024598121643, + "kl": 0.4909600533545017, + "learning_rate": 1.0351973420714878e-05, + "loss": 0.0821, + "reward": 1.1140625178813934, + "reward_std": 0.15720976814627646, + "rewards/accuracy_reward": 0.137500006146729, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625119209289, + "step": 1687 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.2937683105469, + "epoch": 0.5402464394303088, + "grad_norm": 0.1724947839975357, + "kl": 0.33601075038313866, + "learning_rate": 1.0340804060339797e-05, + "loss": 0.0944, + "reward": 0.9734375178813934, + "reward_std": 0.10235114470124244, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.935937511920929, + "step": 1688 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.4458557128906, + "epoch": 0.5405664906385022, + "grad_norm": 0.1991930603981018, + "kl": 0.36307480111718177, + "learning_rate": 1.0329634274285189e-05, + "loss": 0.0666, + "reward": 1.0364583492279054, + "reward_std": 0.09679662808775902, + "rewards/accuracy_reward": 0.06250000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1689 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.658349609375, + "epoch": 0.5408865418466955, + "grad_norm": 0.24091768264770508, + "kl": 0.574387788772583, + "learning_rate": 1.031846407650261e-05, + "loss": 0.1114, + "reward": 1.054166704416275, + "reward_std": 0.15670420825481415, + "rewards/accuracy_reward": 0.08958333600312471, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833551883698, + "step": 1690 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.4583557128906, + "epoch": 0.5412065930548888, + "grad_norm": 0.2540891766548157, + "kl": 0.48114641904830935, + "learning_rate": 1.030729348094414e-05, + "loss": 0.1407, + "reward": 1.071354192495346, + "reward_std": 0.14129463881254195, + "rewards/accuracy_reward": 0.09791667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.973437511920929, + "step": 1691 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.5500244140625, + "epoch": 0.5415266442630821, + "grad_norm": 0.3592195510864258, + "kl": 0.6452970117330551, + "learning_rate": 1.0296122501562347e-05, + "loss": 0.1715, + "reward": 1.0239583671092987, + "reward_std": 0.18306272029876708, + "rewards/accuracy_reward": 0.06666666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916865348816, + "step": 1692 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.8125244140625, + "epoch": 0.5418466954712754, + "grad_norm": 0.192162424325943, + "kl": 0.4380524292588234, + "learning_rate": 1.0284951152310292e-05, + "loss": 0.149, + "reward": 1.006250011920929, + "reward_std": 0.13173125982284545, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833432674408, + "step": 1693 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.4479309082031, + "epoch": 0.5421667466794687, + "grad_norm": 0.3079032301902771, + "kl": 0.46010053232312204, + "learning_rate": 1.0273779447141487e-05, + "loss": 0.0816, + "reward": 1.0546875238418578, + "reward_std": 0.13184548616409303, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375, + "step": 1694 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.2458526611329, + "epoch": 0.5424867978876621, + "grad_norm": 0.2043299674987793, + "kl": 0.27213485464453696, + "learning_rate": 1.0262607400009895e-05, + "loss": 0.1073, + "reward": 1.1109375357627869, + "reward_std": 0.1973067745566368, + "rewards/accuracy_reward": 0.13541667088866233, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1695 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.2250152587891, + "epoch": 0.5428068490958553, + "grad_norm": 0.33777788281440735, + "kl": 0.27160152047872543, + "learning_rate": 1.0251435024869894e-05, + "loss": 0.0876, + "reward": 1.1229166865348816, + "reward_std": 0.1457744762301445, + "rewards/accuracy_reward": 0.14583333842456342, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833373069763, + "step": 1696 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.5833465576172, + "epoch": 0.5431269003040486, + "grad_norm": 0.19615255296230316, + "kl": 0.38076775074005126, + "learning_rate": 1.0240262335676294e-05, + "loss": 0.1301, + "reward": 1.1328125178813935, + "reward_std": 0.11095966622233391, + "rewards/accuracy_reward": 0.1541666716337204, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9765625119209289, + "step": 1697 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.7187652587891, + "epoch": 0.543446951512242, + "grad_norm": 0.23194903135299683, + "kl": 0.2781291104853153, + "learning_rate": 1.0229089346384273e-05, + "loss": 0.0444, + "reward": 1.053125023841858, + "reward_std": 0.12236902713775635, + "rewards/accuracy_reward": 0.07083333469927311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916746139526, + "step": 1698 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.3187744140625, + "epoch": 0.5437670027204353, + "grad_norm": 0.3693836033344269, + "kl": 0.6928737178444863, + "learning_rate": 1.0217916070949405e-05, + "loss": 0.1238, + "reward": 1.1114583730697631, + "reward_std": 0.17411674037575722, + "rewards/accuracy_reward": 0.1437500011175871, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083551883697, + "step": 1699 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.377099609375, + "epoch": 0.5440870539286286, + "grad_norm": 0.2183925211429596, + "kl": 0.44937950894236567, + "learning_rate": 1.02067425233276e-05, + "loss": 0.139, + "reward": 1.1026041865348817, + "reward_std": 0.16677277013659478, + "rewards/accuracy_reward": 0.13958333786576987, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208492279053, + "step": 1700 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.3729309082031, + "epoch": 0.5444071051368219, + "grad_norm": 0.16149164736270905, + "kl": 0.36703067272901535, + "learning_rate": 1.0195568717475128e-05, + "loss": 0.0896, + "reward": 1.0265625178813935, + "reward_std": 0.14087636768817902, + "rewards/accuracy_reward": 0.054166668094694614, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9723958432674408, + "step": 1701 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.1750122070313, + "epoch": 0.5447271563450152, + "grad_norm": 0.37145406007766724, + "kl": 0.3414155296981335, + "learning_rate": 1.0184394667348572e-05, + "loss": 0.1194, + "reward": 1.0833333492279054, + "reward_std": 0.13669775277376175, + "rewards/accuracy_reward": 0.10833333730697632, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1702 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.1791809082031, + "epoch": 0.5450472075532086, + "grad_norm": 0.15215015411376953, + "kl": 0.2324100576341152, + "learning_rate": 1.0173220386904817e-05, + "loss": 0.0564, + "reward": 1.0458333671092988, + "reward_std": 0.09034402184188366, + "rewards/accuracy_reward": 0.056250002793967725, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1703 + }, + { + "clip_ratio": 0.0, + "completion_length": 504.6291778564453, + "epoch": 0.5453672587614018, + "grad_norm": 0.17038388550281525, + "kl": 0.5619470663368702, + "learning_rate": 1.016204589010104e-05, + "loss": 0.1342, + "reward": 1.1770833611488343, + "reward_std": 0.1644980400800705, + "rewards/accuracy_reward": 0.20625000745058059, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 1704 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.4396057128906, + "epoch": 0.5456873099695951, + "grad_norm": 0.11146536469459534, + "kl": 0.30440557897090914, + "learning_rate": 1.0150871190894693e-05, + "loss": 0.1285, + "reward": 1.051041704416275, + "reward_std": 0.11958832629024982, + "rewards/accuracy_reward": 0.08125000298023224, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.969791692495346, + "step": 1705 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.3520965576172, + "epoch": 0.5460073611777885, + "grad_norm": 0.14789950847625732, + "kl": 0.2733158372342587, + "learning_rate": 1.0139696303243471e-05, + "loss": 0.08, + "reward": 1.0864583432674408, + "reward_std": 0.04624491911381483, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583432674408, + "step": 1706 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.9750183105468, + "epoch": 0.5463274123859817, + "grad_norm": 0.09990892559289932, + "kl": 0.3074110925197601, + "learning_rate": 1.0128521241105312e-05, + "loss": 0.0853, + "reward": 0.9968750178813934, + "reward_std": 0.0871005192399025, + "rewards/accuracy_reward": 0.01458333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916746139526, + "step": 1707 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.25001525878906, + "epoch": 0.546647463594175, + "grad_norm": 0.13222353160381317, + "kl": 0.2891290545463562, + "learning_rate": 1.0117346018438367e-05, + "loss": 0.0606, + "reward": 1.1437500238418579, + "reward_std": 0.14657528325915337, + "rewards/accuracy_reward": 0.15833333786576986, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166686534882, + "step": 1708 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.4229370117188, + "epoch": 0.5469675148023684, + "grad_norm": 0.08542126417160034, + "kl": 0.1715974800288677, + "learning_rate": 1.0106170649200985e-05, + "loss": 0.0243, + "reward": 1.0723958730697631, + "reward_std": 0.06993448249995708, + "rewards/accuracy_reward": 0.08125000242143869, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9911458492279053, + "step": 1709 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.3270935058594, + "epoch": 0.5472875660105617, + "grad_norm": 0.12961845099925995, + "kl": 0.29536170735955236, + "learning_rate": 1.0094995147351715e-05, + "loss": 0.0834, + "reward": 1.0854166984558105, + "reward_std": 0.09780984222888947, + "rewards/accuracy_reward": 0.10000000335276127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166746139527, + "step": 1710 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.302099609375, + "epoch": 0.547607617218755, + "grad_norm": 0.10168123990297318, + "kl": 0.19317631945014, + "learning_rate": 1.008381952684925e-05, + "loss": 0.0354, + "reward": 1.0630208671092987, + "reward_std": 0.08708528894931078, + "rewards/accuracy_reward": 0.07291666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9901041746139526, + "step": 1711 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.3437652587891, + "epoch": 0.5479276684269483, + "grad_norm": 0.11124243587255478, + "kl": 0.2048973672091961, + "learning_rate": 1.0072643801652442e-05, + "loss": 0.0773, + "reward": 1.0687500178813933, + "reward_std": 0.1460909903049469, + "rewards/accuracy_reward": 0.09166667070239783, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833432674408, + "step": 1712 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.2229370117187, + "epoch": 0.5482477196351416, + "grad_norm": 0.0702410340309143, + "kl": 0.2370643712580204, + "learning_rate": 1.006146798572027e-05, + "loss": 0.0927, + "reward": 1.0328125357627869, + "reward_std": 0.14687610492110253, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9828125059604644, + "step": 1713 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.8208465576172, + "epoch": 0.548567770843335, + "grad_norm": 0.06981991976499557, + "kl": 0.22894330993294715, + "learning_rate": 1.0050292093011835e-05, + "loss": 0.0763, + "reward": 1.0192708492279052, + "reward_std": 0.052289125695824626, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 1714 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.9583465576172, + "epoch": 0.5488878220515282, + "grad_norm": 0.10607703030109406, + "kl": 0.28106397688388823, + "learning_rate": 1.0039116137486323e-05, + "loss": 0.0508, + "reward": 1.1041666924953462, + "reward_std": 0.1148946724832058, + "rewards/accuracy_reward": 0.11666667070239782, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9833333432674408, + "step": 1715 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.5750183105469, + "epoch": 0.5492078732597215, + "grad_norm": 0.1637829840183258, + "kl": 0.25569094344973564, + "learning_rate": 1.0027940133103005e-05, + "loss": 0.0764, + "reward": 1.0239583611488343, + "reward_std": 0.12470999825745821, + "rewards/accuracy_reward": 0.041666668653488156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916865348816, + "step": 1716 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.1979339599609, + "epoch": 0.5495279244679149, + "grad_norm": 0.0756787434220314, + "kl": 0.1757739432156086, + "learning_rate": 1.0016764093821203e-05, + "loss": 0.0281, + "reward": 1.0666666984558106, + "reward_std": 0.08019343838095665, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9895833373069763, + "step": 1717 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.1271057128906, + "epoch": 0.5498479756761082, + "grad_norm": 0.045003801584243774, + "kl": 0.13475093320012094, + "learning_rate": 1.0005588033600305e-05, + "loss": 0.0476, + "reward": 1.083854192495346, + "reward_std": 0.04586947858333588, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541805744171, + "step": 1718 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.1666839599609, + "epoch": 0.5501680268843014, + "grad_norm": 0.1230769082903862, + "kl": 0.2083885557949543, + "learning_rate": 9.994411966399699e-06, + "loss": 0.0736, + "reward": 1.0526041865348816, + "reward_std": 0.10287219993770122, + "rewards/accuracy_reward": 0.06875000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541865348815, + "step": 1719 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.9979370117187, + "epoch": 0.5504880780924948, + "grad_norm": 0.1451004594564438, + "kl": 0.36494098976254463, + "learning_rate": 9.983235906178798e-06, + "loss": 0.1206, + "reward": 1.0989583611488343, + "reward_std": 0.1661299206316471, + "rewards/accuracy_reward": 0.1291666727513075, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9697916865348816, + "step": 1720 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.5500244140625, + "epoch": 0.5508081293006881, + "grad_norm": 0.08950361609458923, + "kl": 0.330229202657938, + "learning_rate": 9.972059866897002e-06, + "loss": 0.0704, + "reward": 1.0515625178813934, + "reward_std": 0.1308392234146595, + "rewards/accuracy_reward": 0.07708333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791805744171, + "step": 1721 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.695849609375, + "epoch": 0.5511281805088815, + "grad_norm": 0.054417140781879425, + "kl": 0.10512780025601387, + "learning_rate": 9.960883862513682e-06, + "loss": 0.0288, + "reward": 1.082291692495346, + "reward_std": 0.10033504888415337, + "rewards/accuracy_reward": 0.08958333563059569, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.990625011920929, + "step": 1722 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.5875183105469, + "epoch": 0.5514482317170747, + "grad_norm": 0.09014873951673508, + "kl": 0.20714004188776017, + "learning_rate": 9.949707906988165e-06, + "loss": 0.0622, + "reward": 1.0781250238418578, + "reward_std": 0.11389563996344805, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 1723 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.9979370117187, + "epoch": 0.551768282925268, + "grad_norm": 0.08591794967651367, + "kl": 0.17230487614870071, + "learning_rate": 9.938532014279731e-06, + "loss": 0.0433, + "reward": 1.1239583611488342, + "reward_std": 0.08865927271544934, + "rewards/accuracy_reward": 0.1395833369344473, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1724 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.6645904541016, + "epoch": 0.5520883341334614, + "grad_norm": 0.047448597848415375, + "kl": 0.14786509796977043, + "learning_rate": 9.927356198347561e-06, + "loss": 0.0394, + "reward": 1.0640625476837158, + "reward_std": 0.09608743041753769, + "rewards/accuracy_reward": 0.07500000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9890625059604645, + "step": 1725 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.5562713623046, + "epoch": 0.5524083853416547, + "grad_norm": 0.13344469666481018, + "kl": 0.33826933801174164, + "learning_rate": 9.916180473150753e-06, + "loss": 0.0634, + "reward": 0.9958333551883698, + "reward_std": 0.11803839653730393, + "rewards/accuracy_reward": 0.01875000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833432674408, + "step": 1726 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.1375122070312, + "epoch": 0.5527284365498479, + "grad_norm": 0.044267505407333374, + "kl": 0.1466631069779396, + "learning_rate": 9.905004852648288e-06, + "loss": 0.0545, + "reward": 1.0765625357627868, + "reward_std": 0.1297352697700262, + "rewards/accuracy_reward": 0.09166666883975268, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958551883698, + "step": 1727 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.8937683105469, + "epoch": 0.5530484877580413, + "grad_norm": 0.11980026960372925, + "kl": 0.28844860270619394, + "learning_rate": 9.893829350799016e-06, + "loss": 0.0464, + "reward": 1.1302083611488343, + "reward_std": 0.09609245825558901, + "rewards/accuracy_reward": 0.1416666716337204, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9864583492279053, + "step": 1728 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.214599609375, + "epoch": 0.5533685389662346, + "grad_norm": 0.072134830057621, + "kl": 0.25654660165309906, + "learning_rate": 9.882653981561638e-06, + "loss": 0.0195, + "reward": 1.0312500238418578, + "reward_std": 0.08351408448070288, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9833333492279053, + "step": 1729 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.9604278564453, + "epoch": 0.553688590174428, + "grad_norm": 0.08518262952566147, + "kl": 0.26288840398192403, + "learning_rate": 9.871478758894692e-06, + "loss": 0.0425, + "reward": 1.1421875298023223, + "reward_std": 0.14297616370022298, + "rewards/accuracy_reward": 0.16458334047347306, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 1730 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.177099609375, + "epoch": 0.5540086413826212, + "grad_norm": 0.05721981078386307, + "kl": 0.2188819907605648, + "learning_rate": 9.860303696756528e-06, + "loss": 0.045, + "reward": 1.1135417103767395, + "reward_std": 0.10241693221032619, + "rewards/accuracy_reward": 0.13333333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9802083492279052, + "step": 1731 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.2521026611328, + "epoch": 0.5543286925908145, + "grad_norm": 0.2216687947511673, + "kl": 0.12100109234452247, + "learning_rate": 9.849128809105309e-06, + "loss": 0.0488, + "reward": 1.044791692495346, + "reward_std": 0.11933745443820953, + "rewards/accuracy_reward": 0.058333336189389226, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583432674408, + "step": 1732 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.4812805175782, + "epoch": 0.5546487437990079, + "grad_norm": 0.07465074211359024, + "kl": 0.25847496688365934, + "learning_rate": 9.837954109898961e-06, + "loss": 0.0534, + "reward": 1.0395833730697632, + "reward_std": 0.16923051699995995, + "rewards/accuracy_reward": 0.06666666772216559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166865348816, + "step": 1733 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.4104370117187, + "epoch": 0.5549687950072012, + "grad_norm": 0.06225651502609253, + "kl": 0.18474493846297263, + "learning_rate": 9.826779613095188e-06, + "loss": 0.044, + "reward": 1.0541666805744172, + "reward_std": 0.09043601714074612, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1734 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.4187683105469, + "epoch": 0.5552888462153944, + "grad_norm": 0.08788628876209259, + "kl": 0.15150585621595383, + "learning_rate": 9.815605332651433e-06, + "loss": 0.0426, + "reward": 1.0718750178813934, + "reward_std": 0.11696450784802437, + "rewards/accuracy_reward": 0.0854166692122817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583373069763, + "step": 1735 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.5146118164063, + "epoch": 0.5556088974235878, + "grad_norm": 0.07596233487129211, + "kl": 0.38066075593233106, + "learning_rate": 9.804431282524874e-06, + "loss": 0.089, + "reward": 1.019791692495346, + "reward_std": 0.12768033295869827, + "rewards/accuracy_reward": 0.04583333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1736 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.495849609375, + "epoch": 0.5559289486317811, + "grad_norm": 0.19324903190135956, + "kl": 0.40700176954269407, + "learning_rate": 9.793257476672403e-06, + "loss": 0.0711, + "reward": 1.0598958611488343, + "reward_std": 0.1368673298507929, + "rewards/accuracy_reward": 0.08958333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.970312523841858, + "step": 1737 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.3854339599609, + "epoch": 0.5562489998399744, + "grad_norm": 0.08396127820014954, + "kl": 0.2656080096960068, + "learning_rate": 9.782083929050601e-06, + "loss": 0.0594, + "reward": 1.082812535762787, + "reward_std": 0.1377605564892292, + "rewards/accuracy_reward": 0.10416666921228171, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1738 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.589599609375, + "epoch": 0.5565690510481677, + "grad_norm": 0.10959643125534058, + "kl": 0.32876670695841315, + "learning_rate": 9.77091065361573e-06, + "loss": 0.0646, + "reward": 1.0072916924953461, + "reward_std": 0.1158070158213377, + "rewards/accuracy_reward": 0.02916666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9781250178813934, + "step": 1739 + }, + { + "clip_ratio": 0.0, + "completion_length": 654.5187744140625, + "epoch": 0.556889102256361, + "grad_norm": 0.1902201622724533, + "kl": 0.44203677251935003, + "learning_rate": 9.759737664323709e-06, + "loss": 0.1057, + "reward": 1.0968750298023224, + "reward_std": 0.14357101432979108, + "rewards/accuracy_reward": 0.13333333637565375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1740 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.4333435058594, + "epoch": 0.5572091534645544, + "grad_norm": 0.2078673541545868, + "kl": 0.5072756253182888, + "learning_rate": 9.748564975130106e-06, + "loss": 0.1311, + "reward": 1.0427083432674409, + "reward_std": 0.1215952442958951, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083432674408, + "step": 1741 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.9771118164062, + "epoch": 0.5575292046727477, + "grad_norm": 0.11386916786432266, + "kl": 0.34886466041207315, + "learning_rate": 9.737392599990109e-06, + "loss": 0.0777, + "reward": 1.122916692495346, + "reward_std": 0.14374384582042693, + "rewards/accuracy_reward": 0.14791667126119137, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1742 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.6396057128907, + "epoch": 0.5578492558809409, + "grad_norm": 0.1811244934797287, + "kl": 0.3296015664935112, + "learning_rate": 9.726220552858516e-06, + "loss": 0.113, + "reward": 1.0166666865348817, + "reward_std": 0.1435274824500084, + "rewards/accuracy_reward": 0.052083334513008596, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833432674408, + "step": 1743 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.3541809082031, + "epoch": 0.5581693070891343, + "grad_norm": 0.2830093204975128, + "kl": 0.4410018026828766, + "learning_rate": 9.71504884768971e-06, + "loss": 0.1006, + "reward": 1.091666692495346, + "reward_std": 0.17801312804222108, + "rewards/accuracy_reward": 0.11875000111758709, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9708333492279053, + "step": 1744 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.2833557128906, + "epoch": 0.5584893582973276, + "grad_norm": 0.30681195855140686, + "kl": 0.5721335649490357, + "learning_rate": 9.703877498437657e-06, + "loss": 0.0964, + "reward": 1.029166680574417, + "reward_std": 0.11975382026284934, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9604166805744171, + "step": 1745 + }, + { + "clip_ratio": 0.0, + "completion_length": 624.4666931152344, + "epoch": 0.5588094095055209, + "grad_norm": 0.22259894013404846, + "kl": 0.3894990190863609, + "learning_rate": 9.692706519055865e-06, + "loss": 0.0725, + "reward": 1.0526041865348816, + "reward_std": 0.14981426876038312, + "rewards/accuracy_reward": 0.07916666939854622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.973437511920929, + "step": 1746 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.0333557128906, + "epoch": 0.5591294607137142, + "grad_norm": 0.1207633763551712, + "kl": 0.33568143993616106, + "learning_rate": 9.681535923497394e-06, + "loss": 0.0745, + "reward": 1.0114583492279052, + "reward_std": 0.14165182113647462, + "rewards/accuracy_reward": 0.03958333358168602, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9677083432674408, + "step": 1747 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.6666839599609, + "epoch": 0.5594495119219075, + "grad_norm": 0.22380438446998596, + "kl": 0.346661651134491, + "learning_rate": 9.670365725714811e-06, + "loss": 0.0865, + "reward": 1.076562523841858, + "reward_std": 0.14469496812671423, + "rewards/accuracy_reward": 0.1125000011175871, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9640625238418579, + "step": 1748 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.5687744140625, + "epoch": 0.5597695631301008, + "grad_norm": 0.12634803354740143, + "kl": 0.41345595121383666, + "learning_rate": 9.659195939660203e-06, + "loss": 0.0835, + "reward": 1.1130208730697633, + "reward_std": 0.17043216675519943, + "rewards/accuracy_reward": 0.1458333373069763, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9651041805744172, + "step": 1749 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.4729309082031, + "epoch": 0.5600896143382941, + "grad_norm": 0.10981366038322449, + "kl": 0.3390358090400696, + "learning_rate": 9.648026579285125e-06, + "loss": 0.069, + "reward": 1.0333333551883697, + "reward_std": 0.16134923771023751, + "rewards/accuracy_reward": 0.0562500024214387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833492279053, + "step": 1750 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.1771057128906, + "epoch": 0.5604096655464874, + "grad_norm": 0.10159897804260254, + "kl": 0.16987637989223003, + "learning_rate": 9.636857658540615e-06, + "loss": 0.0226, + "reward": 1.0968750238418579, + "reward_std": 0.1050514079630375, + "rewards/accuracy_reward": 0.11041666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583492279053, + "step": 1751 + }, + { + "clip_ratio": 0.0, + "completion_length": 629.0166870117188, + "epoch": 0.5607297167546808, + "grad_norm": 0.09187393635511398, + "kl": 0.2695496570318937, + "learning_rate": 9.625689191377148e-06, + "loss": 0.0551, + "reward": 1.1104167103767395, + "reward_std": 0.15159886330366135, + "rewards/accuracy_reward": 0.13750000465661288, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166865348816, + "step": 1752 + }, + { + "clip_ratio": 0.0, + "completion_length": 643.1916931152343, + "epoch": 0.5610497679628741, + "grad_norm": 0.4285680651664734, + "kl": 0.2703603833913803, + "learning_rate": 9.614521191744644e-06, + "loss": 0.0776, + "reward": 1.0583333611488341, + "reward_std": 0.12563749849796296, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000238418579, + "step": 1753 + }, + { + "clip_ratio": 0.0, + "completion_length": 629.8146118164062, + "epoch": 0.5613698191710673, + "grad_norm": 0.09549083560705185, + "kl": 0.2873098261654377, + "learning_rate": 9.603353673592435e-06, + "loss": 0.0493, + "reward": 1.0541666984558105, + "reward_std": 0.14787348750978707, + "rewards/accuracy_reward": 0.07500000130385161, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9770833611488342, + "step": 1754 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.4833557128907, + "epoch": 0.5616898703792607, + "grad_norm": 0.19138483703136444, + "kl": 0.2949482426047325, + "learning_rate": 9.592186650869245e-06, + "loss": 0.0604, + "reward": 1.1250000417232513, + "reward_std": 0.1398756790906191, + "rewards/accuracy_reward": 0.14583333879709243, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1755 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.3729370117187, + "epoch": 0.562009921587454, + "grad_norm": 0.16488175094127655, + "kl": 0.34419357851147653, + "learning_rate": 9.581020137523192e-06, + "loss": 0.0469, + "reward": 1.068750023841858, + "reward_std": 0.0808381624519825, + "rewards/accuracy_reward": 0.08125000298023224, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9854166805744171, + "step": 1756 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.8333435058594, + "epoch": 0.5623299727956473, + "grad_norm": 0.08196991682052612, + "kl": 0.24638563096523286, + "learning_rate": 9.569854147501752e-06, + "loss": 0.0558, + "reward": 1.0651041865348816, + "reward_std": 0.17485166918486356, + "rewards/accuracy_reward": 0.08541666939854622, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 1757 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.8979309082031, + "epoch": 0.5626500240038406, + "grad_norm": 0.17706768214702606, + "kl": 0.2538708359003067, + "learning_rate": 9.55868869475174e-06, + "loss": 0.0841, + "reward": 1.074479204416275, + "reward_std": 0.15826401822268962, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.970312523841858, + "step": 1758 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.7750244140625, + "epoch": 0.5629700752120339, + "grad_norm": 0.08481159806251526, + "kl": 0.2495666116476059, + "learning_rate": 9.547523793219315e-06, + "loss": 0.0402, + "reward": 1.0734375417232513, + "reward_std": 0.1499858619645238, + "rewards/accuracy_reward": 0.10000000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9734375238418579, + "step": 1759 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.827099609375, + "epoch": 0.5632901264202272, + "grad_norm": 0.11646457761526108, + "kl": 0.3505744531750679, + "learning_rate": 9.536359456849933e-06, + "loss": 0.0508, + "reward": 1.119791716337204, + "reward_std": 0.1696897467598319, + "rewards/accuracy_reward": 0.1458333356305957, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583551883697, + "step": 1760 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.7646057128907, + "epoch": 0.5636101776284206, + "grad_norm": 0.16916526854038239, + "kl": 0.3435643449425697, + "learning_rate": 9.52519569958835e-06, + "loss": 0.0638, + "reward": 1.092187523841858, + "reward_std": 0.16328086461871863, + "rewards/accuracy_reward": 0.1187500024214387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.973437511920929, + "step": 1761 + }, + { + "clip_ratio": 0.0, + "completion_length": 635.420849609375, + "epoch": 0.5639302288366138, + "grad_norm": 0.2558411657810211, + "kl": 0.39502771496772765, + "learning_rate": 9.514032535378604e-06, + "loss": 0.0492, + "reward": 1.025000023841858, + "reward_std": 0.11617990657687187, + "rewards/accuracy_reward": 0.043750000186264515, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9791666746139527, + "step": 1762 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.1646057128906, + "epoch": 0.5642502800448072, + "grad_norm": 0.23282131552696228, + "kl": 0.2503409251570702, + "learning_rate": 9.50286997816398e-06, + "loss": 0.0868, + "reward": 1.0854166865348815, + "reward_std": 0.1403332645073533, + "rewards/accuracy_reward": 0.11041666939854622, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9729166865348816, + "step": 1763 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.9833465576172, + "epoch": 0.5645703312530005, + "grad_norm": 0.5210441946983337, + "kl": 0.37920553535223006, + "learning_rate": 9.491708041887017e-06, + "loss": 0.1104, + "reward": 1.127604204416275, + "reward_std": 0.15253622308373452, + "rewards/accuracy_reward": 0.15625000447034837, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9692708551883698, + "step": 1764 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.0166809082032, + "epoch": 0.5648903824611938, + "grad_norm": 0.18722592294216156, + "kl": 0.6182475075125694, + "learning_rate": 9.480546740489468e-06, + "loss": 0.0809, + "reward": 1.0296875417232514, + "reward_std": 0.15643419921398163, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9588541924953461, + "step": 1765 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.4625183105469, + "epoch": 0.5652104336693871, + "grad_norm": 0.42721042037010193, + "kl": 0.5760065197944642, + "learning_rate": 9.469386087912302e-06, + "loss": 0.1066, + "reward": 1.1250000238418578, + "reward_std": 0.1428021177649498, + "rewards/accuracy_reward": 0.1562500052154064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500119209289, + "step": 1766 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.5666809082031, + "epoch": 0.5655304848775804, + "grad_norm": 0.2159615010023117, + "kl": 0.4528600886464119, + "learning_rate": 9.458226098095675e-06, + "loss": 0.1271, + "reward": 1.0973958671092987, + "reward_std": 0.15849269963800908, + "rewards/accuracy_reward": 0.1395833384245634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9578125178813934, + "step": 1767 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.2062683105469, + "epoch": 0.5658505360857737, + "grad_norm": 0.2607513964176178, + "kl": 0.5638085767626763, + "learning_rate": 9.447066784978914e-06, + "loss": 0.0944, + "reward": 1.0526041984558105, + "reward_std": 0.15217989590018988, + "rewards/accuracy_reward": 0.09375000204890967, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9567708492279052, + "step": 1768 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.6875244140625, + "epoch": 0.5661705872939671, + "grad_norm": 0.28668591380119324, + "kl": 0.6076577290892601, + "learning_rate": 9.435908162500499e-06, + "loss": 0.0706, + "reward": 1.0057291746139527, + "reward_std": 0.14748432487249374, + "rewards/accuracy_reward": 0.039583333395421506, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 1769 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.2250152587891, + "epoch": 0.5664906385021603, + "grad_norm": 0.2156950682401657, + "kl": 0.48343914821743966, + "learning_rate": 9.42475024459805e-06, + "loss": 0.1229, + "reward": 1.1572916984558106, + "reward_std": 0.2026117168366909, + "rewards/accuracy_reward": 0.20000000428408385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916805744172, + "step": 1770 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.0208465576172, + "epoch": 0.5668106897103536, + "grad_norm": 0.25211068987846375, + "kl": 0.4843083009123802, + "learning_rate": 9.413593045208303e-06, + "loss": 0.0723, + "reward": 0.9859375298023224, + "reward_std": 0.14214141964912413, + "rewards/accuracy_reward": 0.01875000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.967187511920929, + "step": 1771 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.2229248046875, + "epoch": 0.567130740918547, + "grad_norm": 0.17530737817287445, + "kl": 0.41530356407165525, + "learning_rate": 9.402436578267106e-06, + "loss": 0.0879, + "reward": 1.0848958492279053, + "reward_std": 0.15454170852899551, + "rewards/accuracy_reward": 0.12291667088866234, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9598958492279053, + "step": 1772 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.9562622070313, + "epoch": 0.5674507921267403, + "grad_norm": 0.1589900255203247, + "kl": 0.5026217520236969, + "learning_rate": 9.391280857709374e-06, + "loss": 0.0764, + "reward": 0.9989583551883697, + "reward_std": 0.16388722117990256, + "rewards/accuracy_reward": 0.03125000074505806, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083432674408, + "step": 1773 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.2562744140625, + "epoch": 0.5677708433349336, + "grad_norm": 0.3550090491771698, + "kl": 0.3733747750520706, + "learning_rate": 9.380125897469116e-06, + "loss": 0.0788, + "reward": 1.0442708551883697, + "reward_std": 0.17154857516288757, + "rewards/accuracy_reward": 0.07708333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9671875059604644, + "step": 1774 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.4750152587891, + "epoch": 0.5680908945431269, + "grad_norm": 0.32089048624038696, + "kl": 0.5282266348600387, + "learning_rate": 9.36897171147937e-06, + "loss": 0.1056, + "reward": 1.0885417103767394, + "reward_std": 0.21879145503044128, + "rewards/accuracy_reward": 0.11666667088866234, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 1775 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.164599609375, + "epoch": 0.5684109457513202, + "grad_norm": 0.3526063859462738, + "kl": 0.6023999392986298, + "learning_rate": 9.357818313672216e-06, + "loss": 0.0963, + "reward": 1.0281250178813934, + "reward_std": 0.16391725689172745, + "rewards/accuracy_reward": 0.06458333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9635416686534881, + "step": 1776 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.9229400634765, + "epoch": 0.5687309969595136, + "grad_norm": 0.3334544897079468, + "kl": 0.6702438533306122, + "learning_rate": 9.346665717978742e-06, + "loss": 0.1199, + "reward": 1.0270833611488341, + "reward_std": 0.14344887398183345, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.956250011920929, + "step": 1777 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.3396057128906, + "epoch": 0.5690510481677068, + "grad_norm": 0.3106757700443268, + "kl": 0.7477944895625115, + "learning_rate": 9.335513938329046e-06, + "loss": 0.1181, + "reward": 1.0416666865348816, + "reward_std": 0.21114777252078057, + "rewards/accuracy_reward": 0.08750000167638064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9541666805744171, + "step": 1778 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.0812683105469, + "epoch": 0.5693710993759001, + "grad_norm": 0.1424209475517273, + "kl": 0.5696847230195999, + "learning_rate": 9.324362988652195e-06, + "loss": 0.1224, + "reward": 1.0354166865348815, + "reward_std": 0.169924059510231, + "rewards/accuracy_reward": 0.07083333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9645833432674408, + "step": 1779 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.4000213623046, + "epoch": 0.5696911505840935, + "grad_norm": 0.16978739202022552, + "kl": 0.43719258829951285, + "learning_rate": 9.313212882876228e-06, + "loss": 0.1375, + "reward": 1.085937535762787, + "reward_std": 0.15211977660655976, + "rewards/accuracy_reward": 0.11875000353902579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.967187511920929, + "step": 1780 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.8708465576171, + "epoch": 0.5700112017922868, + "grad_norm": 0.2683226466178894, + "kl": 0.49288763031363486, + "learning_rate": 9.30206363492812e-06, + "loss": 0.1011, + "reward": 1.0385416924953461, + "reward_std": 0.1908488892018795, + "rewards/accuracy_reward": 0.07916666697710753, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.959375011920929, + "step": 1781 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.5250183105469, + "epoch": 0.57033125300048, + "grad_norm": 0.29383039474487305, + "kl": 0.6997188687324524, + "learning_rate": 9.290915258733792e-06, + "loss": 0.1354, + "reward": 0.9651041865348816, + "reward_std": 0.16285497993230819, + "rewards/accuracy_reward": 0.01250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9526041686534882, + "step": 1782 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.9250122070313, + "epoch": 0.5706513042086734, + "grad_norm": 0.1701168715953827, + "kl": 0.5816778719425202, + "learning_rate": 9.279767768218058e-06, + "loss": 0.1418, + "reward": 1.1244792103767396, + "reward_std": 0.1965101033449173, + "rewards/accuracy_reward": 0.16250000428408384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9619791746139527, + "step": 1783 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.2729431152344, + "epoch": 0.5709713554168667, + "grad_norm": 0.1655789464712143, + "kl": 0.44416755214333536, + "learning_rate": 9.268621177304635e-06, + "loss": 0.0587, + "reward": 1.0848958611488342, + "reward_std": 0.10237012188881636, + "rewards/accuracy_reward": 0.10208333600312472, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.982812511920929, + "step": 1784 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.1750183105469, + "epoch": 0.5712914066250601, + "grad_norm": 0.11165868490934372, + "kl": 0.3100586123764515, + "learning_rate": 9.25747549991611e-06, + "loss": 0.0946, + "reward": 1.0302083551883698, + "reward_std": 0.16860452741384507, + "rewards/accuracy_reward": 0.06250000018626452, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9677083492279053, + "step": 1785 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.6375244140625, + "epoch": 0.5716114578332533, + "grad_norm": 0.19193939864635468, + "kl": 0.3580825373530388, + "learning_rate": 9.246330749973943e-06, + "loss": 0.0696, + "reward": 1.1083333492279053, + "reward_std": 0.14191762804985047, + "rewards/accuracy_reward": 0.12500000540167094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333373069764, + "step": 1786 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.5541809082031, + "epoch": 0.5719315090414466, + "grad_norm": 0.06165502592921257, + "kl": 0.18685958310961723, + "learning_rate": 9.235186941398412e-06, + "loss": 0.0567, + "reward": 1.0651041865348816, + "reward_std": 0.08984843343496322, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 1787 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.8687713623046, + "epoch": 0.57225156024964, + "grad_norm": 0.21538884937763214, + "kl": 0.3270559675991535, + "learning_rate": 9.224044088108642e-06, + "loss": 0.0818, + "reward": 1.1588541984558105, + "reward_std": 0.10733593087643385, + "rewards/accuracy_reward": 0.18333333935588597, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1788 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.9062713623047, + "epoch": 0.5725716114578333, + "grad_norm": 0.1343001127243042, + "kl": 0.19680135846138, + "learning_rate": 9.212902204022556e-06, + "loss": 0.057, + "reward": 1.0593750178813934, + "reward_std": 0.09946857746690511, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9885416686534881, + "step": 1789 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.0208587646484, + "epoch": 0.5728916626660265, + "grad_norm": 0.05754992738366127, + "kl": 0.15173882991075516, + "learning_rate": 9.20176130305686e-06, + "loss": 0.0388, + "reward": 1.1447916865348815, + "reward_std": 0.07982183620333672, + "rewards/accuracy_reward": 0.1541666707023978, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.990625, + "step": 1790 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.8437713623047, + "epoch": 0.5732117138742199, + "grad_norm": 0.2448616623878479, + "kl": 0.30400142446160316, + "learning_rate": 9.190621399127045e-06, + "loss": 0.0644, + "reward": 1.0401041865348817, + "reward_std": 0.10776591561734676, + "rewards/accuracy_reward": 0.058333334513008595, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708432674408, + "step": 1791 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.3979431152344, + "epoch": 0.5735317650824132, + "grad_norm": 0.05121641978621483, + "kl": 0.16107941642403603, + "learning_rate": 9.179482506147346e-06, + "loss": 0.0506, + "reward": 1.0645833432674408, + "reward_std": 0.06739432364702225, + "rewards/accuracy_reward": 0.07916666995733976, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9854166746139527, + "step": 1792 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.7750274658204, + "epoch": 0.5738518162906064, + "grad_norm": 0.17669126391410828, + "kl": 0.1536485359072685, + "learning_rate": 9.168344638030743e-06, + "loss": 0.0414, + "reward": 1.0526041984558105, + "reward_std": 0.07714264132082463, + "rewards/accuracy_reward": 0.06250000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9901041746139526, + "step": 1793 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.4916900634765, + "epoch": 0.5741718674987998, + "grad_norm": 0.06108405813574791, + "kl": 0.1886889159679413, + "learning_rate": 9.157207808688925e-06, + "loss": 0.0528, + "reward": 1.0994791984558105, + "reward_std": 0.10481414943933487, + "rewards/accuracy_reward": 0.11458333749324083, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9848958492279053, + "step": 1794 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.9520904541016, + "epoch": 0.5744919187069931, + "grad_norm": 0.0590064600110054, + "kl": 0.25982956662774087, + "learning_rate": 9.146072032032298e-06, + "loss": 0.0622, + "reward": 1.160416692495346, + "reward_std": 0.08663471266627312, + "rewards/accuracy_reward": 0.1791666718199849, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9812500059604645, + "step": 1795 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.2187774658203, + "epoch": 0.5748119699151865, + "grad_norm": 0.09085609018802643, + "kl": 0.24539805799722672, + "learning_rate": 9.134937321969941e-06, + "loss": 0.0423, + "reward": 1.107291692495346, + "reward_std": 0.10802287980914116, + "rewards/accuracy_reward": 0.12083333693444728, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9864583373069763, + "step": 1796 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.8791809082031, + "epoch": 0.5751320211233797, + "grad_norm": 0.05450833588838577, + "kl": 0.15207632929086684, + "learning_rate": 9.123803692409609e-06, + "loss": 0.0306, + "reward": 1.0890625238418579, + "reward_std": 0.07662020195275546, + "rewards/accuracy_reward": 0.10000000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9890625238418579, + "step": 1797 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.5541778564453, + "epoch": 0.575452072331573, + "grad_norm": 0.12327313423156738, + "kl": 0.36361787244677546, + "learning_rate": 9.112671157257698e-06, + "loss": 0.0845, + "reward": 1.0776041984558105, + "reward_std": 0.14283109903335572, + "rewards/accuracy_reward": 0.09375000391155482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9838541746139526, + "step": 1798 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.6666809082031, + "epoch": 0.5757721235397664, + "grad_norm": 0.3154713809490204, + "kl": 0.17768015563488007, + "learning_rate": 9.101539730419247e-06, + "loss": 0.0444, + "reward": 1.0208333611488343, + "reward_std": 0.11578117497265339, + "rewards/accuracy_reward": 0.03333333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.987500011920929, + "step": 1799 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.6021026611328, + "epoch": 0.5760921747479597, + "grad_norm": 0.08551464974880219, + "kl": 0.14677060693502425, + "learning_rate": 9.090409425797908e-06, + "loss": 0.0544, + "reward": 1.0614583492279053, + "reward_std": 0.08336172327399254, + "rewards/accuracy_reward": 0.07291667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9885416746139526, + "step": 1800 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.9854370117188, + "epoch": 0.5764122259561529, + "grad_norm": 0.0542651005089283, + "kl": 0.19063802286982537, + "learning_rate": 9.07928025729593e-06, + "loss": 0.0169, + "reward": 1.0145833432674407, + "reward_std": 0.05645497292280197, + "rewards/accuracy_reward": 0.022916667722165586, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9916666746139526, + "step": 1801 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.9625061035156, + "epoch": 0.5767322771643463, + "grad_norm": 0.0507982037961483, + "kl": 0.19528093002736568, + "learning_rate": 9.068152238814139e-06, + "loss": 0.0329, + "reward": 1.0057291746139527, + "reward_std": 0.06221659388393164, + "rewards/accuracy_reward": 0.01458333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9911458432674408, + "step": 1802 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.5396057128906, + "epoch": 0.5770523283725396, + "grad_norm": 0.10525275021791458, + "kl": 0.17930835708975792, + "learning_rate": 9.057025384251934e-06, + "loss": 0.0635, + "reward": 1.0838541865348816, + "reward_std": 0.09392364919185639, + "rewards/accuracy_reward": 0.09791667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 1803 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.9771026611328, + "epoch": 0.577372379580733, + "grad_norm": 0.10307978838682175, + "kl": 0.30039278194308283, + "learning_rate": 9.045899707507247e-06, + "loss": 0.0845, + "reward": 1.100000023841858, + "reward_std": 0.1839461788535118, + "rewards/accuracy_reward": 0.1312500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9687500238418579, + "step": 1804 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.1333557128906, + "epoch": 0.5776924307889262, + "grad_norm": 0.15342208743095398, + "kl": 0.27368993014097215, + "learning_rate": 9.034775222476555e-06, + "loss": 0.0676, + "reward": 1.0489583611488342, + "reward_std": 0.12007906846702099, + "rewards/accuracy_reward": 0.0708333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.978125023841858, + "step": 1805 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.8333557128906, + "epoch": 0.5780124819971195, + "grad_norm": 0.08706728368997574, + "kl": 0.26777110919356345, + "learning_rate": 9.023651943054825e-06, + "loss": 0.0508, + "reward": 1.0380208611488342, + "reward_std": 0.11098587065935135, + "rewards/accuracy_reward": 0.05000000055879354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9880208373069763, + "step": 1806 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.747933959961, + "epoch": 0.5783325332053129, + "grad_norm": 0.1062178835272789, + "kl": 0.3234595455229282, + "learning_rate": 9.012529883135548e-06, + "loss": 0.0528, + "reward": 1.0947916984558106, + "reward_std": 0.13509288653731347, + "rewards/accuracy_reward": 0.11875000223517418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416746139526, + "step": 1807 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.345849609375, + "epoch": 0.5786525844135062, + "grad_norm": 0.12420543283224106, + "kl": 0.27922215312719345, + "learning_rate": 9.001409056610662e-06, + "loss": 0.0838, + "reward": 0.9921875178813935, + "reward_std": 0.09044673759490252, + "rewards/accuracy_reward": 0.012500000558793545, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.979687511920929, + "step": 1808 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.7291809082031, + "epoch": 0.5789726356216994, + "grad_norm": 0.10455825179815292, + "kl": 0.2904270239174366, + "learning_rate": 8.990289477370587e-06, + "loss": 0.0701, + "reward": 1.1682292103767395, + "reward_std": 0.15833674147725105, + "rewards/accuracy_reward": 0.1812500037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9869791805744171, + "step": 1809 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.3041748046875, + "epoch": 0.5792926868298928, + "grad_norm": 0.12697599828243256, + "kl": 0.43104536086320877, + "learning_rate": 8.979171159304166e-06, + "loss": 0.0888, + "reward": 1.0036458492279052, + "reward_std": 0.11862440332770348, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291865348815, + "step": 1810 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.5312622070312, + "epoch": 0.5796127380380861, + "grad_norm": 0.2758425176143646, + "kl": 0.4129337251186371, + "learning_rate": 8.968054116298683e-06, + "loss": 0.0883, + "reward": 1.025000011920929, + "reward_std": 0.1669561004266143, + "rewards/accuracy_reward": 0.05208333376795053, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9729166746139526, + "step": 1811 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.5604370117187, + "epoch": 0.5799327892462794, + "grad_norm": 0.07522869855165482, + "kl": 0.23817719668149948, + "learning_rate": 8.95693836223982e-06, + "loss": 0.0411, + "reward": 1.043750035762787, + "reward_std": 0.13217582330107688, + "rewards/accuracy_reward": 0.0604166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333492279053, + "step": 1812 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.7854370117187, + "epoch": 0.5802528404544727, + "grad_norm": 0.12487686425447464, + "kl": 0.3361851140856743, + "learning_rate": 8.94582391101165e-06, + "loss": 0.0758, + "reward": 1.1651041984558106, + "reward_std": 0.1337714796885848, + "rewards/accuracy_reward": 0.18333333637565374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708373069763, + "step": 1813 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.3562622070312, + "epoch": 0.580572891662666, + "grad_norm": 0.08441471308469772, + "kl": 0.3453087739646435, + "learning_rate": 8.934710776496623e-06, + "loss": 0.0425, + "reward": 1.0348958611488341, + "reward_std": 0.1058173468336463, + "rewards/accuracy_reward": 0.05625000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458551883698, + "step": 1814 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.9521057128907, + "epoch": 0.5808929428708594, + "grad_norm": 0.27958735823631287, + "kl": 0.3278763361275196, + "learning_rate": 8.923598972575537e-06, + "loss": 0.0478, + "reward": 1.1067708611488343, + "reward_std": 0.1621831137686968, + "rewards/accuracy_reward": 0.11875000558793544, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9880208492279052, + "step": 1815 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.8083618164062, + "epoch": 0.5812129940790527, + "grad_norm": 0.1701308786869049, + "kl": 0.27441012263298037, + "learning_rate": 8.912488513127539e-06, + "loss": 0.0623, + "reward": 1.0520833611488343, + "reward_std": 0.13372773118317127, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9791666805744171, + "step": 1816 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.4354370117187, + "epoch": 0.5815330452872459, + "grad_norm": 0.11255865544080734, + "kl": 0.484719355404377, + "learning_rate": 8.901379412030089e-06, + "loss": 0.0888, + "reward": 1.1411458551883698, + "reward_std": 0.19930556789040565, + "rewards/accuracy_reward": 0.1645833369344473, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9744791865348816, + "step": 1817 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.7291900634766, + "epoch": 0.5818530964954393, + "grad_norm": 0.1106775552034378, + "kl": 0.16104906797409058, + "learning_rate": 8.89027168315895e-06, + "loss": 0.0416, + "reward": 1.1359375298023224, + "reward_std": 0.0840451443567872, + "rewards/accuracy_reward": 0.14791667014360427, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9880208492279052, + "step": 1818 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.6437622070313, + "epoch": 0.5821731477036326, + "grad_norm": 0.11812810599803925, + "kl": 0.2805390991270542, + "learning_rate": 8.879165340388171e-06, + "loss": 0.059, + "reward": 1.0562500178813934, + "reward_std": 0.14737335927784442, + "rewards/accuracy_reward": 0.08125000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1819 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.6979309082031, + "epoch": 0.5824931989118259, + "grad_norm": 0.10681235790252686, + "kl": 0.1654998004436493, + "learning_rate": 8.868060397590075e-06, + "loss": 0.0493, + "reward": 1.1421875357627869, + "reward_std": 0.10358922835439444, + "rewards/accuracy_reward": 0.1520833395421505, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9880208492279052, + "step": 1820 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.9562622070313, + "epoch": 0.5828132501200192, + "grad_norm": 0.11712974309921265, + "kl": 0.430647674202919, + "learning_rate": 8.856956868635233e-06, + "loss": 0.075, + "reward": 1.1479166984558105, + "reward_std": 0.1634374063462019, + "rewards/accuracy_reward": 0.17708333842456342, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333551883698, + "step": 1821 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.0479339599609, + "epoch": 0.5831333013282125, + "grad_norm": 0.16343854367733002, + "kl": 0.4074979230761528, + "learning_rate": 8.845854767392448e-06, + "loss": 0.079, + "reward": 1.1473958849906922, + "reward_std": 0.1910140451043844, + "rewards/accuracy_reward": 0.1708333410322666, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625178813935, + "step": 1822 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.0375244140625, + "epoch": 0.5834533525364058, + "grad_norm": 0.16372530162334442, + "kl": 0.3780060760676861, + "learning_rate": 8.834754107728738e-06, + "loss": 0.032, + "reward": 1.0942708730697632, + "reward_std": 0.1673297893255949, + "rewards/accuracy_reward": 0.11250000204890967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708492279053, + "step": 1823 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.7833526611328, + "epoch": 0.5837734037445992, + "grad_norm": 0.13239432871341705, + "kl": 0.326027612388134, + "learning_rate": 8.82365490350933e-06, + "loss": 0.0631, + "reward": 1.1911458730697633, + "reward_std": 0.16341153010725976, + "rewards/accuracy_reward": 0.2125000050291419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458432674408, + "step": 1824 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.020849609375, + "epoch": 0.5840934549527924, + "grad_norm": 0.32353493571281433, + "kl": 0.23483212813735008, + "learning_rate": 8.812557168597626e-06, + "loss": 0.0763, + "reward": 1.0645833611488342, + "reward_std": 0.1218577940016985, + "rewards/accuracy_reward": 0.0875000024214387, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833551883698, + "step": 1825 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.0958557128906, + "epoch": 0.5844135061609858, + "grad_norm": 0.14850178360939026, + "kl": 0.2920558929443359, + "learning_rate": 8.801460916855194e-06, + "loss": 0.0739, + "reward": 1.0692708611488342, + "reward_std": 0.1560745693743229, + "rewards/accuracy_reward": 0.08750000372529029, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9796875178813934, + "step": 1826 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.6604370117187, + "epoch": 0.5847335573691791, + "grad_norm": 0.15987901389598846, + "kl": 0.47453284710645677, + "learning_rate": 8.790366162141747e-06, + "loss": 0.0701, + "reward": 1.1197916984558105, + "reward_std": 0.1661427855491638, + "rewards/accuracy_reward": 0.1458333371207118, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9697916865348816, + "step": 1827 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.220849609375, + "epoch": 0.5850536085773724, + "grad_norm": 0.17211389541625977, + "kl": 0.44642241299152374, + "learning_rate": 8.779272918315135e-06, + "loss": 0.0873, + "reward": 1.0687500178813933, + "reward_std": 0.18413979113101958, + "rewards/accuracy_reward": 0.10625000149011612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9625000059604645, + "step": 1828 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.7250183105468, + "epoch": 0.5853736597855657, + "grad_norm": 0.1202344223856926, + "kl": 0.4751662090420723, + "learning_rate": 8.768181199231309e-06, + "loss": 0.0769, + "reward": 1.0192708611488341, + "reward_std": 0.12260491922497749, + "rewards/accuracy_reward": 0.045833334885537626, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9713541805744171, + "step": 1829 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.227099609375, + "epoch": 0.585693710993759, + "grad_norm": 0.26715612411499023, + "kl": 0.5222580231726169, + "learning_rate": 8.757091018744327e-06, + "loss": 0.0813, + "reward": 1.1015625476837159, + "reward_std": 0.15981225967407225, + "rewards/accuracy_reward": 0.1250000052154064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625119209289, + "step": 1830 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.545849609375, + "epoch": 0.5860137622019523, + "grad_norm": 0.1669369488954544, + "kl": 0.35020282939076425, + "learning_rate": 8.746002390706318e-06, + "loss": 0.0863, + "reward": 1.110416704416275, + "reward_std": 0.11681613381952047, + "rewards/accuracy_reward": 0.1354166718199849, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000178813935, + "step": 1831 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.5604370117187, + "epoch": 0.5863338134101457, + "grad_norm": 0.15815506875514984, + "kl": 0.5524609833955765, + "learning_rate": 8.734915328967484e-06, + "loss": 0.0783, + "reward": 1.0312500178813935, + "reward_std": 0.1657247856259346, + "rewards/accuracy_reward": 0.07291666995733977, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9583333373069763, + "step": 1832 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.268765258789, + "epoch": 0.5866538646183389, + "grad_norm": 0.09822983294725418, + "kl": 0.27492492496967313, + "learning_rate": 8.723829847376054e-06, + "loss": 0.0452, + "reward": 1.042187511920929, + "reward_std": 0.12717793975025415, + "rewards/accuracy_reward": 0.06250000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.979687511920929, + "step": 1833 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.5625183105469, + "epoch": 0.5869739158265322, + "grad_norm": 0.08277434855699539, + "kl": 0.2869084417819977, + "learning_rate": 8.712745959778293e-06, + "loss": 0.0608, + "reward": 1.0411458611488342, + "reward_std": 0.1428369514644146, + "rewards/accuracy_reward": 0.0666666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791865348816, + "step": 1834 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.0021057128906, + "epoch": 0.5872939670347256, + "grad_norm": 0.081912100315094, + "kl": 0.14179150089621545, + "learning_rate": 8.70166368001847e-06, + "loss": 0.0047, + "reward": 1.176041692495346, + "reward_std": 0.09121424276381732, + "rewards/accuracy_reward": 0.1854166727513075, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.990625011920929, + "step": 1835 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.1062683105469, + "epoch": 0.5876140182429188, + "grad_norm": 0.10189507901668549, + "kl": 0.19410741589963437, + "learning_rate": 8.690583021938854e-06, + "loss": 0.0435, + "reward": 1.107291692495346, + "reward_std": 0.1382425595074892, + "rewards/accuracy_reward": 0.11666667014360428, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9906250059604644, + "step": 1836 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.6708587646484, + "epoch": 0.5879340694511122, + "grad_norm": 0.08415410667657852, + "kl": 0.15824654512107372, + "learning_rate": 8.679503999379679e-06, + "loss": 0.0542, + "reward": 1.0067708432674407, + "reward_std": 0.12068486250936986, + "rewards/accuracy_reward": 0.02708333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.979687511920929, + "step": 1837 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.6166870117188, + "epoch": 0.5882541206593055, + "grad_norm": 0.17711393535137177, + "kl": 0.18984725177288056, + "learning_rate": 8.66842662617914e-06, + "loss": 0.0495, + "reward": 1.0437500298023223, + "reward_std": 0.116551817022264, + "rewards/accuracy_reward": 0.06041666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9833333492279053, + "step": 1838 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.8062744140625, + "epoch": 0.5885741718674988, + "grad_norm": 0.30450427532196045, + "kl": 0.3485433362424374, + "learning_rate": 8.657350916173376e-06, + "loss": 0.0873, + "reward": 1.0572916924953462, + "reward_std": 0.1278322547674179, + "rewards/accuracy_reward": 0.08333333544433116, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583492279053, + "step": 1839 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.1375122070312, + "epoch": 0.5888942230756921, + "grad_norm": 0.06369006633758545, + "kl": 0.19454945325851442, + "learning_rate": 8.646276883196438e-06, + "loss": 0.0488, + "reward": 1.1505208551883697, + "reward_std": 0.0879446528851986, + "rewards/accuracy_reward": 0.16458333935588598, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375059604645, + "step": 1840 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.237515258789, + "epoch": 0.5892142742838854, + "grad_norm": 0.1309588998556137, + "kl": 0.36753388717770574, + "learning_rate": 8.635204541080297e-06, + "loss": 0.0331, + "reward": 1.1442708671092987, + "reward_std": 0.12082125805318356, + "rewards/accuracy_reward": 0.16875000409781932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1841 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.8687713623046, + "epoch": 0.5895343254920787, + "grad_norm": 0.06236157566308975, + "kl": 0.2872423198074102, + "learning_rate": 8.624133903654802e-06, + "loss": 0.0428, + "reward": 1.043750035762787, + "reward_std": 0.14997683018445968, + "rewards/accuracy_reward": 0.06250000093132257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.981250011920929, + "step": 1842 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.7250183105468, + "epoch": 0.5898543767002721, + "grad_norm": 0.07617057114839554, + "kl": 0.1260071013122797, + "learning_rate": 8.613064984747672e-06, + "loss": 0.0206, + "reward": 1.0890625238418579, + "reward_std": 0.1047076016664505, + "rewards/accuracy_reward": 0.09791666902601719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9911458492279053, + "step": 1843 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.8041809082031, + "epoch": 0.5901744279084653, + "grad_norm": 0.054941531270742416, + "kl": 0.1285891652107239, + "learning_rate": 8.601997798184486e-06, + "loss": 0.0382, + "reward": 1.1401041984558105, + "reward_std": 0.10463837906718254, + "rewards/accuracy_reward": 0.1500000035390258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9901041686534882, + "step": 1844 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.7166809082031, + "epoch": 0.5904944791166586, + "grad_norm": 0.05618637055158615, + "kl": 0.19999119341373445, + "learning_rate": 8.590932357788652e-06, + "loss": 0.0212, + "reward": 1.1223958551883697, + "reward_std": 0.0716858796775341, + "rewards/accuracy_reward": 0.13333333656191826, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9890625059604645, + "step": 1845 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.1625244140625, + "epoch": 0.590814530324852, + "grad_norm": 0.1865171641111374, + "kl": 0.21524617075920105, + "learning_rate": 8.5798686773814e-06, + "loss": 0.0601, + "reward": 1.1328125238418578, + "reward_std": 0.11431420799344778, + "rewards/accuracy_reward": 0.15416667088866234, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458492279053, + "step": 1846 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.3208557128906, + "epoch": 0.5911345815330453, + "grad_norm": 0.10544977337121964, + "kl": 0.16690328232944013, + "learning_rate": 8.568806770781769e-06, + "loss": 0.0462, + "reward": 1.0468750178813935, + "reward_std": 0.15823222547769547, + "rewards/accuracy_reward": 0.06458333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9822916924953461, + "step": 1847 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.977099609375, + "epoch": 0.5914546327412386, + "grad_norm": 0.13692694902420044, + "kl": 0.3568322047591209, + "learning_rate": 8.557746651806566e-06, + "loss": 0.0936, + "reward": 1.0713541984558106, + "reward_std": 0.17686703354120253, + "rewards/accuracy_reward": 0.10000000260770321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9713541805744171, + "step": 1848 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.2041931152344, + "epoch": 0.5917746839494319, + "grad_norm": 0.07218168675899506, + "kl": 0.15120973512530328, + "learning_rate": 8.546688334270381e-06, + "loss": 0.0523, + "reward": 1.108333373069763, + "reward_std": 0.10372773855924607, + "rewards/accuracy_reward": 0.12083334028720856, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9875000238418579, + "step": 1849 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.3396118164062, + "epoch": 0.5920947351576252, + "grad_norm": 0.12064994126558304, + "kl": 0.28165081068873404, + "learning_rate": 8.53563183198555e-06, + "loss": 0.0522, + "reward": 1.0250000298023223, + "reward_std": 0.12865074295550585, + "rewards/accuracy_reward": 0.04791666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9770833432674408, + "step": 1850 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.3687683105469, + "epoch": 0.5924147863658186, + "grad_norm": 0.051113247871398926, + "kl": 0.16676584184169768, + "learning_rate": 8.524577158762137e-06, + "loss": 0.0378, + "reward": 1.0406250298023223, + "reward_std": 0.0711493318900466, + "rewards/accuracy_reward": 0.05208333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9885416805744172, + "step": 1851 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.1875183105469, + "epoch": 0.5927348375740118, + "grad_norm": 0.07360915094614029, + "kl": 0.13382341675460338, + "learning_rate": 8.51352432840792e-06, + "loss": 0.0258, + "reward": 1.0822916984558106, + "reward_std": 0.1298227585852146, + "rewards/accuracy_reward": 0.08750000279396772, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9927083432674408, + "step": 1852 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.8396057128906, + "epoch": 0.5930548887822051, + "grad_norm": 0.08503315597772598, + "kl": 0.24217786304652691, + "learning_rate": 8.502473354728384e-06, + "loss": 0.0298, + "reward": 1.1484375298023224, + "reward_std": 0.13012812230736018, + "rewards/accuracy_reward": 0.16250000353902577, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9859375119209289, + "step": 1853 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.2250213623047, + "epoch": 0.5933749399903985, + "grad_norm": 0.1039816215634346, + "kl": 0.13602565452456475, + "learning_rate": 8.491424251526688e-06, + "loss": 0.0292, + "reward": 1.0276041865348815, + "reward_std": 0.04552529603242874, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9901041746139526, + "step": 1854 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.8479431152343, + "epoch": 0.5936949911985918, + "grad_norm": 0.06977000087499619, + "kl": 0.23898737505078316, + "learning_rate": 8.480377032603658e-06, + "loss": 0.0305, + "reward": 1.0536458671092988, + "reward_std": 0.08575146589428187, + "rewards/accuracy_reward": 0.06666666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9869791865348816, + "step": 1855 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.564599609375, + "epoch": 0.594015042406785, + "grad_norm": 0.0967579260468483, + "kl": 0.29310873821377753, + "learning_rate": 8.46933171175776e-06, + "loss": 0.0658, + "reward": 1.160937523841858, + "reward_std": 0.15478852652013303, + "rewards/accuracy_reward": 0.17291667368263006, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9838541746139526, + "step": 1856 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.5770935058594, + "epoch": 0.5943350936149784, + "grad_norm": 0.12447807192802429, + "kl": 0.2548402227461338, + "learning_rate": 8.4582883027851e-06, + "loss": 0.0578, + "reward": 1.0817708611488341, + "reward_std": 0.15226125419139863, + "rewards/accuracy_reward": 0.10416666846722364, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9755208432674408, + "step": 1857 + }, + { + "clip_ratio": 0.0, + "completion_length": 644.5750183105469, + "epoch": 0.5946551448231717, + "grad_norm": 0.1542220264673233, + "kl": 0.2618264824151993, + "learning_rate": 8.44724681947939e-06, + "loss": 0.0396, + "reward": 1.0192708492279052, + "reward_std": 0.0826049368828535, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9817708492279053, + "step": 1858 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.1791931152344, + "epoch": 0.5949751960313651, + "grad_norm": 0.23071099817752838, + "kl": 0.41815656051039696, + "learning_rate": 8.436207275631937e-06, + "loss": 0.0822, + "reward": 1.003125011920929, + "reward_std": 0.13680147156119346, + "rewards/accuracy_reward": 0.027083333395421506, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416805744171, + "step": 1859 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.9541870117188, + "epoch": 0.5952952472395583, + "grad_norm": 0.22107675671577454, + "kl": 0.32695833742618563, + "learning_rate": 8.425169685031623e-06, + "loss": 0.0619, + "reward": 1.0614583611488342, + "reward_std": 0.12557398490607738, + "rewards/accuracy_reward": 0.08750000204890966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583551883697, + "step": 1860 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.077099609375, + "epoch": 0.5956152984477516, + "grad_norm": 0.08938566595315933, + "kl": 0.1705992490053177, + "learning_rate": 8.414134061464898e-06, + "loss": 0.0631, + "reward": 1.027604204416275, + "reward_std": 0.14426877107471228, + "rewards/accuracy_reward": 0.050000001676380634, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 1861 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.795849609375, + "epoch": 0.595935349655945, + "grad_norm": 0.3400190472602844, + "kl": 0.5977102071046829, + "learning_rate": 8.403100418715743e-06, + "loss": 0.0797, + "reward": 1.0213542044162751, + "reward_std": 0.13304599486291407, + "rewards/accuracy_reward": 0.05833333432674408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9630208611488342, + "step": 1862 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.295849609375, + "epoch": 0.5962554008641383, + "grad_norm": 0.13622289896011353, + "kl": 0.3738606728613377, + "learning_rate": 8.392068770565675e-06, + "loss": 0.0608, + "reward": 1.0859375238418578, + "reward_std": 0.11104068085551262, + "rewards/accuracy_reward": 0.10833333637565375, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9755208492279053, + "step": 1863 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.4291931152344, + "epoch": 0.5965754520723315, + "grad_norm": 0.16210956871509552, + "kl": 0.32467666193842887, + "learning_rate": 8.381039130793718e-06, + "loss": 0.1061, + "reward": 1.051562523841858, + "reward_std": 0.1963793769478798, + "rewards/accuracy_reward": 0.08333333451300859, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291865348815, + "step": 1864 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.9354309082031, + "epoch": 0.5968955032805249, + "grad_norm": 0.12312551587820053, + "kl": 0.2595462821424007, + "learning_rate": 8.370011513176381e-06, + "loss": 0.0772, + "reward": 1.0911458730697632, + "reward_std": 0.10535571686923503, + "rewards/accuracy_reward": 0.11250000428408384, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9786458373069763, + "step": 1865 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.1708618164063, + "epoch": 0.5972155544887182, + "grad_norm": 0.3161362111568451, + "kl": 0.3036894164979458, + "learning_rate": 8.35898593148766e-06, + "loss": 0.0434, + "reward": 1.036979192495346, + "reward_std": 0.11786015536636114, + "rewards/accuracy_reward": 0.06041666828095913, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9765625238418579, + "step": 1866 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.6125244140625, + "epoch": 0.5975356056969116, + "grad_norm": 0.14329548180103302, + "kl": 0.3524964414536953, + "learning_rate": 8.347962399498996e-06, + "loss": 0.1037, + "reward": 1.0718750357627869, + "reward_std": 0.159610353410244, + "rewards/accuracy_reward": 0.09791666958481074, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9739583551883697, + "step": 1867 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.6395935058594, + "epoch": 0.5978556569051048, + "grad_norm": 0.16905654966831207, + "kl": 0.3246914021670818, + "learning_rate": 8.336940930979275e-06, + "loss": 0.0681, + "reward": 1.0343750238418579, + "reward_std": 0.17898188009858132, + "rewards/accuracy_reward": 0.05833333563059569, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9760416746139526, + "step": 1868 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.1625122070312, + "epoch": 0.5981757081132981, + "grad_norm": 0.2423020452260971, + "kl": 0.33233404383063314, + "learning_rate": 8.325921539694805e-06, + "loss": 0.0671, + "reward": 1.193229216337204, + "reward_std": 0.2123827485367656, + "rewards/accuracy_reward": 0.22083333879709244, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9703125298023224, + "step": 1869 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.6083526611328, + "epoch": 0.5984957593214915, + "grad_norm": 0.13021445274353027, + "kl": 0.2521624334156513, + "learning_rate": 8.314904239409295e-06, + "loss": 0.0551, + "reward": 1.0932291924953461, + "reward_std": 0.11757631208747625, + "rewards/accuracy_reward": 0.11458333693444729, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9744791865348816, + "step": 1870 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.9771087646484, + "epoch": 0.5988158105296848, + "grad_norm": 0.1187562495470047, + "kl": 0.3661712847650051, + "learning_rate": 8.303889043883852e-06, + "loss": 0.0749, + "reward": 1.1473958611488342, + "reward_std": 0.17185933999717234, + "rewards/accuracy_reward": 0.1729166703298688, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9744791805744171, + "step": 1871 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.9666870117187, + "epoch": 0.599135861737878, + "grad_norm": 0.1332739293575287, + "kl": 0.34436929896473883, + "learning_rate": 8.292875966876947e-06, + "loss": 0.0807, + "reward": 1.1187500417232514, + "reward_std": 0.14176899399608373, + "rewards/accuracy_reward": 0.147916672937572, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9708333432674408, + "step": 1872 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.145849609375, + "epoch": 0.5994559129460714, + "grad_norm": 0.21813121438026428, + "kl": 0.27012273371219636, + "learning_rate": 8.281865022144403e-06, + "loss": 0.0445, + "reward": 1.0078125178813935, + "reward_std": 0.13042169529944658, + "rewards/accuracy_reward": 0.025000000931322576, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9786458551883698, + "step": 1873 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.7396057128906, + "epoch": 0.5997759641542647, + "grad_norm": 0.21716895699501038, + "kl": 0.29259502738714216, + "learning_rate": 8.270856223439386e-06, + "loss": 0.0442, + "reward": 1.0255208671092988, + "reward_std": 0.14278821237385272, + "rewards/accuracy_reward": 0.04791666902601719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9776041805744171, + "step": 1874 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.4291870117188, + "epoch": 0.600096015362458, + "grad_norm": 0.12663888931274414, + "kl": 0.33957659900188447, + "learning_rate": 8.25984958451238e-06, + "loss": 0.0852, + "reward": 1.0302083671092988, + "reward_std": 0.13524105921387672, + "rewards/accuracy_reward": 0.05833333525806665, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9697916805744171, + "step": 1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.2229370117187, + "epoch": 0.6004160665706513, + "grad_norm": 0.1679977923631668, + "kl": 0.4340445719659328, + "learning_rate": 8.248845119111168e-06, + "loss": 0.096, + "reward": 1.0390625298023224, + "reward_std": 0.15016994029283523, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9682291805744171, + "step": 1876 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.52294921875, + "epoch": 0.6007361177788446, + "grad_norm": 0.3022070527076721, + "kl": 0.3699290931224823, + "learning_rate": 8.23784284098082e-06, + "loss": 0.11, + "reward": 1.0911458492279054, + "reward_std": 0.18554365485906602, + "rewards/accuracy_reward": 0.12083333898335695, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9682291805744171, + "step": 1877 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.1500244140625, + "epoch": 0.601056168987038, + "grad_norm": 0.09283298999071121, + "kl": 0.35596207827329635, + "learning_rate": 8.226842763863675e-06, + "loss": 0.0632, + "reward": 1.0843750298023225, + "reward_std": 0.11838657595217228, + "rewards/accuracy_reward": 0.10208333730697632, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9781250119209289, + "step": 1878 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.0000183105469, + "epoch": 0.6013762201952313, + "grad_norm": 0.15658091008663177, + "kl": 0.21277981698513032, + "learning_rate": 8.21584490149932e-06, + "loss": 0.0449, + "reward": 1.080208384990692, + "reward_std": 0.1628492258489132, + "rewards/accuracy_reward": 0.09791666902601719, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9781250298023224, + "step": 1879 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.5937561035156, + "epoch": 0.6016962714034245, + "grad_norm": 0.21834583580493927, + "kl": 0.41966616809368135, + "learning_rate": 8.20484926762458e-06, + "loss": 0.0764, + "reward": 1.0187500178813935, + "reward_std": 0.08752396404743194, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1880 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.5562683105469, + "epoch": 0.6020163226116179, + "grad_norm": 0.3363422453403473, + "kl": 0.26540239825844764, + "learning_rate": 8.19385587597349e-06, + "loss": 0.0591, + "reward": 1.058854204416275, + "reward_std": 0.16936119571328162, + "rewards/accuracy_reward": 0.0770833345130086, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9776041746139527, + "step": 1881 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.0604431152344, + "epoch": 0.6023363738198112, + "grad_norm": 0.21959060430526733, + "kl": 0.3500664710998535, + "learning_rate": 8.182864740277293e-06, + "loss": 0.0697, + "reward": 1.0619792103767396, + "reward_std": 0.13994233533740044, + "rewards/accuracy_reward": 0.08333333749324083, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.9723958432674408, + "step": 1882 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.7271057128906, + "epoch": 0.6026564250280044, + "grad_norm": 0.3588142395019531, + "kl": 0.6075701117515564, + "learning_rate": 8.171875874264408e-06, + "loss": 0.1016, + "reward": 1.0489583611488342, + "reward_std": 0.1500548876821995, + "rewards/accuracy_reward": 0.07500000279396772, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9718750178813934, + "step": 1883 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.9437744140625, + "epoch": 0.6029764762361978, + "grad_norm": 0.16027261316776276, + "kl": 0.2999001145362854, + "learning_rate": 8.160889291660423e-06, + "loss": 0.0407, + "reward": 1.152604204416275, + "reward_std": 0.1476465906947851, + "rewards/accuracy_reward": 0.15833333730697632, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.9880208432674408, + "step": 1884 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.5125213623047, + "epoch": 0.6032965274443911, + "grad_norm": 0.13303539156913757, + "kl": 0.3691695436835289, + "learning_rate": 8.149905006188067e-06, + "loss": 0.0584, + "reward": 1.0328125178813934, + "reward_std": 0.10183481201529503, + "rewards/accuracy_reward": 0.05208333544433117, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9786458432674408, + "step": 1885 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.1396026611328, + "epoch": 0.6036165786525844, + "grad_norm": 0.1396128535270691, + "kl": 0.39802908822894095, + "learning_rate": 8.13892303156721e-06, + "loss": 0.0949, + "reward": 1.1458333671092986, + "reward_std": 0.14314947053790092, + "rewards/accuracy_reward": 0.1687500063329935, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9750000059604644, + "step": 1886 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.9104370117187, + "epoch": 0.6039366298607777, + "grad_norm": 0.20720265805721283, + "kl": 0.47100530862808226, + "learning_rate": 8.127943381514822e-06, + "loss": 0.1295, + "reward": 1.0052083432674408, + "reward_std": 0.16166542023420333, + "rewards/accuracy_reward": 0.0479166679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9572916746139526, + "step": 1887 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.8812744140625, + "epoch": 0.604256681068971, + "grad_norm": 0.17445041239261627, + "kl": 0.3852656245231628, + "learning_rate": 8.116966069744987e-06, + "loss": 0.0527, + "reward": 1.1286458730697633, + "reward_std": 0.1472001016139984, + "rewards/accuracy_reward": 0.14583333861082792, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.9765625119209289, + "step": 1888 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.8521057128906, + "epoch": 0.6045767322771644, + "grad_norm": 0.11102991551160812, + "kl": 0.26756716668605807, + "learning_rate": 8.105991109968846e-06, + "loss": 0.0496, + "reward": 1.028125023841858, + "reward_std": 0.12138459905982017, + "rewards/accuracy_reward": 0.03750000167638064, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.9843750119209289, + "step": 1889 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.4062683105469, + "epoch": 0.6048967834853577, + "grad_norm": 0.10683989524841309, + "kl": 0.3563895784318447, + "learning_rate": 8.095018515894633e-06, + "loss": 0.0677, + "reward": 1.0942708671092987, + "reward_std": 0.16333364136517048, + "rewards/accuracy_reward": 0.11250000279396773, + "rewards/format_reward": 0.01250000037252903, + "rewards/tag_count_reward": 0.9692708492279053, + "step": 1890 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.8750183105469, + "epoch": 0.6052168346935509, + "grad_norm": 0.10681261867284775, + "kl": 0.2178695060312748, + "learning_rate": 8.084048301227597e-06, + "loss": 0.0632, + "reward": 1.0796875357627869, + "reward_std": 0.13646226227283478, + "rewards/accuracy_reward": 0.09166667144745588, + "rewards/format_reward": 0.00416666679084301, + "rewards/tag_count_reward": 0.9838541746139526, + "step": 1891 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.268765258789, + "epoch": 0.6055368859017443, + "grad_norm": 0.1127118170261383, + "kl": 0.1855003535747528, + "learning_rate": 8.073080479670033e-06, + "loss": 0.0417, + "reward": 1.0666666805744172, + "reward_std": 0.1344422660768032, + "rewards/accuracy_reward": 0.08333333600312472, + "rewards/format_reward": 0.00833333358168602, + "rewards/tag_count_reward": 0.975000011920929, + "step": 1892 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.2666809082032, + "epoch": 0.6058569371099376, + "grad_norm": 0.21781377494335175, + "kl": 0.4310750551521778, + "learning_rate": 8.062115064921235e-06, + "loss": 0.0905, + "reward": 1.079687523841858, + "reward_std": 0.14400339033454657, + "rewards/accuracy_reward": 0.09166666977107525, + "rewards/format_reward": 0.01458333358168602, + "rewards/tag_count_reward": 0.9734375178813934, + "step": 1893 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.6437744140625, + "epoch": 0.6061769883181309, + "grad_norm": 0.2137860506772995, + "kl": 0.3845756992697716, + "learning_rate": 8.051152070677504e-06, + "loss": 0.1213, + "reward": 1.0130208671092986, + "reward_std": 0.19951648712158204, + "rewards/accuracy_reward": 0.04583333414047956, + "rewards/format_reward": 0.010416666977107525, + "rewards/tag_count_reward": 0.9567708611488343, + "step": 1894 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.7833435058594, + "epoch": 0.6064970395263242, + "grad_norm": 0.15622647106647491, + "kl": 0.4282746434211731, + "learning_rate": 8.040191510632105e-06, + "loss": 0.1073, + "reward": 1.0755208671092986, + "reward_std": 0.20779597759246826, + "rewards/accuracy_reward": 0.10416666902601719, + "rewards/format_reward": 0.002083333395421505, + "rewards/tag_count_reward": 0.9692708611488342, + "step": 1895 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.120849609375, + "epoch": 0.6068170907345175, + "grad_norm": 0.4617317020893097, + "kl": 0.4920196183025837, + "learning_rate": 8.02923339847527e-06, + "loss": 0.0567, + "reward": 1.1208333551883698, + "reward_std": 0.16462844759225845, + "rewards/accuracy_reward": 0.1479166716337204, + "rewards/format_reward": 0.006250000186264515, + "rewards/tag_count_reward": 0.9666666746139526, + "step": 1896 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.9041870117187, + "epoch": 0.6071371419427108, + "grad_norm": 0.34303173422813416, + "kl": 0.2250536672770977, + "learning_rate": 8.018277747894178e-06, + "loss": 0.0734, + "reward": 1.041666680574417, + "reward_std": 0.16085805594921113, + "rewards/accuracy_reward": 0.0666666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.9750000059604644, + "step": 1897 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.177099609375, + "epoch": 0.6074571931509042, + "grad_norm": 0.23493434488773346, + "kl": 0.6605220437049866, + "learning_rate": 8.007324572572915e-06, + "loss": 0.139, + "reward": 1.0619792103767396, + "reward_std": 0.23571573868393897, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.01666666716337204, + "rewards/tag_count_reward": 0.9432291865348816, + "step": 1898 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.6937683105468, + "epoch": 0.6077772443590974, + "grad_norm": 0.2770790755748749, + "kl": 0.7114527821540833, + "learning_rate": 7.996373886192496e-06, + "loss": 0.1715, + "reward": 1.0718750476837158, + "reward_std": 0.2413769096136093, + "rewards/accuracy_reward": 0.10625000428408385, + "rewards/format_reward": 0.03750000111758709, + "rewards/tag_count_reward": 0.9281250178813935, + "step": 1899 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.602099609375, + "epoch": 0.6080972955672908, + "grad_norm": 0.7002983689308167, + "kl": 0.8839252561330795, + "learning_rate": 7.985425702430821e-06, + "loss": 0.1461, + "reward": 1.0312500238418578, + "reward_std": 0.17215485386550428, + "rewards/accuracy_reward": 0.05833333544433117, + "rewards/format_reward": 0.016666667349636555, + "rewards/tag_count_reward": 0.9562500178813934, + "step": 1900 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.7437713623046, + "epoch": 0.6084173467754841, + "grad_norm": 0.5294923782348633, + "kl": 1.1238539427518845, + "learning_rate": 7.974480034962655e-06, + "loss": 0.1987, + "reward": 1.046354204416275, + "reward_std": 0.2591739296913147, + "rewards/accuracy_reward": 0.08333333637565374, + "rewards/format_reward": 0.043750001676380636, + "rewards/tag_count_reward": 0.9192708551883697, + "step": 1901 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.6437744140625, + "epoch": 0.6087373979836774, + "grad_norm": 0.22682341933250427, + "kl": 0.9062155365943909, + "learning_rate": 7.96353689745963e-06, + "loss": 0.1727, + "reward": 1.0520833492279054, + "reward_std": 0.2488498877733946, + "rewards/accuracy_reward": 0.09375000130385161, + "rewards/format_reward": 0.025000000931322576, + "rewards/tag_count_reward": 0.9333333492279052, + "step": 1902 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.6833557128906, + "epoch": 0.6090574491918707, + "grad_norm": 0.234841451048851, + "kl": 0.6701559334993362, + "learning_rate": 7.952596303590215e-06, + "loss": 0.1411, + "reward": 1.0953125357627869, + "reward_std": 0.23830147199332713, + "rewards/accuracy_reward": 0.14375000428408385, + "rewards/format_reward": 0.02291666716337204, + "rewards/tag_count_reward": 0.9286458432674408, + "step": 1903 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.452099609375, + "epoch": 0.609377500400064, + "grad_norm": 0.21650651097297668, + "kl": 0.7510048195719718, + "learning_rate": 7.9416582670197e-06, + "loss": 0.1216, + "reward": 1.2067708730697633, + "reward_std": 0.28640821799635885, + "rewards/accuracy_reward": 0.18958333786576986, + "rewards/format_reward": 0.08333333432674409, + "rewards/tag_count_reward": 0.9338541865348816, + "step": 1904 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.0521118164063, + "epoch": 0.6096975516082573, + "grad_norm": 0.5106697678565979, + "kl": 0.5842062175273895, + "learning_rate": 7.930722801410184e-06, + "loss": 0.1217, + "reward": 0.9880208492279052, + "reward_std": 0.2233804479241371, + "rewards/accuracy_reward": 0.01458333432674408, + "rewards/format_reward": 0.045833333395421505, + "rewards/tag_count_reward": 0.9276041805744171, + "step": 1905 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.2041900634765, + "epoch": 0.6100176028164507, + "grad_norm": 0.3872288763523102, + "kl": 0.8763932049274444, + "learning_rate": 7.91978992042055e-06, + "loss": 0.1593, + "reward": 1.0270833551883698, + "reward_std": 0.2873599737882614, + "rewards/accuracy_reward": 0.08541666883975267, + "rewards/format_reward": 0.027083333767950534, + "rewards/tag_count_reward": 0.9145833551883698, + "step": 1906 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.3521057128906, + "epoch": 0.6103376540246439, + "grad_norm": 0.26322004199028015, + "kl": 0.8161401003599167, + "learning_rate": 7.90885963770646e-06, + "loss": 0.1386, + "reward": 1.0750000536441804, + "reward_std": 0.2801113411784172, + "rewards/accuracy_reward": 0.10833333786576986, + "rewards/format_reward": 0.05416666846722364, + "rewards/tag_count_reward": 0.9125000238418579, + "step": 1907 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.8396118164062, + "epoch": 0.6106577052328372, + "grad_norm": 0.29171696305274963, + "kl": 1.0318253219127655, + "learning_rate": 7.89793196692033e-06, + "loss": 0.1855, + "reward": 1.0354166865348815, + "reward_std": 0.26935647130012513, + "rewards/accuracy_reward": 0.08958333563059569, + "rewards/format_reward": 0.03750000111758709, + "rewards/tag_count_reward": 0.9083333432674408, + "step": 1908 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.5166900634765, + "epoch": 0.6109777564410306, + "grad_norm": 0.6068216562271118, + "kl": 1.0604799330234527, + "learning_rate": 7.887006921711301e-06, + "loss": 0.176, + "reward": 1.0729166924953462, + "reward_std": 0.29732812345027926, + "rewards/accuracy_reward": 0.05208333544433117, + "rewards/format_reward": 0.11875000447034836, + "rewards/tag_count_reward": 0.9020833492279052, + "step": 1909 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.8021057128906, + "epoch": 0.6112978076492239, + "grad_norm": 0.15164311230182648, + "kl": 0.815485092997551, + "learning_rate": 7.876084515725248e-06, + "loss": 0.1439, + "reward": 1.0776041984558105, + "reward_std": 0.26183063685894015, + "rewards/accuracy_reward": 0.05208333618938923, + "rewards/format_reward": 0.11458333488553762, + "rewards/tag_count_reward": 0.9109375178813934, + "step": 1910 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.3104370117187, + "epoch": 0.6116178588574172, + "grad_norm": 0.16586171090602875, + "kl": 0.7920496329665184, + "learning_rate": 7.865164762604749e-06, + "loss": 0.1445, + "reward": 1.0729166924953462, + "reward_std": 0.19199963212013244, + "rewards/accuracy_reward": 0.0854166692122817, + "rewards/format_reward": 0.05208333358168602, + "rewards/tag_count_reward": 0.9354166865348816, + "step": 1911 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.5645965576172, + "epoch": 0.6119379100656105, + "grad_norm": 0.1570732742547989, + "kl": 0.758229385316372, + "learning_rate": 7.854247675989057e-06, + "loss": 0.1651, + "reward": 1.0875000298023223, + "reward_std": 0.21725860238075256, + "rewards/accuracy_reward": 0.1250000050291419, + "rewards/format_reward": 0.025000000186264516, + "rewards/tag_count_reward": 0.9375000119209289, + "step": 1912 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.1791778564453, + "epoch": 0.6122579612738038, + "grad_norm": 0.20079578459262848, + "kl": 0.49525588750839233, + "learning_rate": 7.84333326951411e-06, + "loss": 0.1341, + "reward": 1.123437523841858, + "reward_std": 0.2613053783774376, + "rewards/accuracy_reward": 0.07916666679084301, + "rewards/format_reward": 0.11250000335276127, + "rewards/tag_count_reward": 0.9317708492279053, + "step": 1913 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.6000183105468, + "epoch": 0.6125780124819972, + "grad_norm": 0.11362726986408234, + "kl": 0.5724338337779045, + "learning_rate": 7.83242155681248e-06, + "loss": 0.1292, + "reward": 1.1770833551883697, + "reward_std": 0.24515315815806388, + "rewards/accuracy_reward": 0.0854166679084301, + "rewards/format_reward": 0.17708333805203438, + "rewards/tag_count_reward": 0.9145833492279053, + "step": 1914 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.677099609375, + "epoch": 0.6128980636901904, + "grad_norm": 0.295296311378479, + "kl": 0.3709353081882, + "learning_rate": 7.821512551513395e-06, + "loss": 0.1497, + "reward": 1.2864583730697632, + "reward_std": 0.3537163957953453, + "rewards/accuracy_reward": 0.17708333861082792, + "rewards/format_reward": 0.22291667349636554, + "rewards/tag_count_reward": 0.8864583551883698, + "step": 1915 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.900015258789, + "epoch": 0.6132181148983837, + "grad_norm": 0.10389820486307144, + "kl": 0.4146296791732311, + "learning_rate": 7.810606267242687e-06, + "loss": 0.154, + "reward": 1.1968750476837158, + "reward_std": 0.3052997462451458, + "rewards/accuracy_reward": 0.1166666703298688, + "rewards/format_reward": 0.16875000316649674, + "rewards/tag_count_reward": 0.9114583492279053, + "step": 1916 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.7396026611328, + "epoch": 0.6135381661065771, + "grad_norm": 0.08782447874546051, + "kl": 0.4644440606236458, + "learning_rate": 7.799702717622796e-06, + "loss": 0.0892, + "reward": 1.2354167103767395, + "reward_std": 0.237884309142828, + "rewards/accuracy_reward": 0.10416667070239782, + "rewards/format_reward": 0.2208333408460021, + "rewards/tag_count_reward": 0.9104166865348816, + "step": 1917 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.431265258789, + "epoch": 0.6138582173147704, + "grad_norm": 0.26354876160621643, + "kl": 0.7795211791992187, + "learning_rate": 7.788801916272739e-06, + "loss": 0.15, + "reward": 1.1328125417232513, + "reward_std": 0.3334435373544693, + "rewards/accuracy_reward": 0.050000001303851606, + "rewards/format_reward": 0.1916666716337204, + "rewards/tag_count_reward": 0.8911458492279053, + "step": 1918 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.6041870117188, + "epoch": 0.6141782685229636, + "grad_norm": 0.23324386775493622, + "kl": 0.7294996976852417, + "learning_rate": 7.77790387680811e-06, + "loss": 0.1578, + "reward": 1.2291667103767394, + "reward_std": 0.39393016397953035, + "rewards/accuracy_reward": 0.06250000111758709, + "rewards/format_reward": 0.3125000074505806, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1919 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.5208435058594, + "epoch": 0.614498319731157, + "grad_norm": 0.2352185994386673, + "kl": 0.8153878182172776, + "learning_rate": 7.767008612841045e-06, + "loss": 0.1319, + "reward": 1.1875000476837159, + "reward_std": 0.3735229402780533, + "rewards/accuracy_reward": 0.05208333525806665, + "rewards/format_reward": 0.26041667386889455, + "rewards/tag_count_reward": 0.8750000238418579, + "step": 1920 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.5416809082031, + "epoch": 0.6148183709393503, + "grad_norm": 0.37710386514663696, + "kl": 0.8285433441400528, + "learning_rate": 7.75611613798022e-06, + "loss": 0.1573, + "reward": 1.3359375476837159, + "reward_std": 0.46349499821662904, + "rewards/accuracy_reward": 0.17083333637565373, + "rewards/format_reward": 0.310416679084301, + "rewards/tag_count_reward": 0.8546875178813934, + "step": 1921 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.3041870117188, + "epoch": 0.6151384221475437, + "grad_norm": 0.21071451902389526, + "kl": 0.8673926889896393, + "learning_rate": 7.745226465830817e-06, + "loss": 0.1906, + "reward": 1.3776042103767394, + "reward_std": 0.4966884583234787, + "rewards/accuracy_reward": 0.11458333563059568, + "rewards/format_reward": 0.450000011920929, + "rewards/tag_count_reward": 0.8130208551883698, + "step": 1922 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.7416839599609, + "epoch": 0.6154584733557369, + "grad_norm": 0.13909560441970825, + "kl": 0.46110083162784576, + "learning_rate": 7.734339609994527e-06, + "loss": 0.0851, + "reward": 1.3984375596046448, + "reward_std": 0.43087140917778016, + "rewards/accuracy_reward": 0.025000000186264516, + "rewards/format_reward": 0.5583333432674408, + "rewards/tag_count_reward": 0.8151041865348816, + "step": 1923 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.8729309082031, + "epoch": 0.6157785245639302, + "grad_norm": 0.2250901609659195, + "kl": 0.4949187658727169, + "learning_rate": 7.723455584069524e-06, + "loss": 0.1126, + "reward": 1.5614583611488342, + "reward_std": 0.45720491707324984, + "rewards/accuracy_reward": 0.10208333507180214, + "rewards/format_reward": 0.6750000208616257, + "rewards/tag_count_reward": 0.7843750178813934, + "step": 1924 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.2750183105469, + "epoch": 0.6160985757721236, + "grad_norm": 0.5836819410324097, + "kl": 0.4990528032183647, + "learning_rate": 7.712574401650445e-06, + "loss": 0.1638, + "reward": 1.677083384990692, + "reward_std": 0.40421550869941714, + "rewards/accuracy_reward": 0.16250000800937414, + "rewards/format_reward": 0.7541666924953461, + "rewards/tag_count_reward": 0.7604166805744171, + "step": 1925 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.9500183105469, + "epoch": 0.6164186269803168, + "grad_norm": 0.6810193061828613, + "kl": 0.6582358777523041, + "learning_rate": 7.701696076328368e-06, + "loss": 0.1681, + "reward": 1.634895884990692, + "reward_std": 0.49258761405944823, + "rewards/accuracy_reward": 0.11250000298023224, + "rewards/format_reward": 0.7833333611488342, + "rewards/tag_count_reward": 0.7390625298023223, + "step": 1926 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.8208526611328, + "epoch": 0.6167386781885101, + "grad_norm": 0.5494832396507263, + "kl": 0.7763916999101639, + "learning_rate": 7.690820621690815e-06, + "loss": 0.1614, + "reward": 1.5572917103767394, + "reward_std": 0.46946349143981936, + "rewards/accuracy_reward": 0.1166666692122817, + "rewards/format_reward": 0.6833333551883698, + "rewards/tag_count_reward": 0.7572916865348815, + "step": 1927 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.1000213623047, + "epoch": 0.6170587293967035, + "grad_norm": 0.17400583624839783, + "kl": 0.6718953251838684, + "learning_rate": 7.679948051321708e-06, + "loss": 0.2013, + "reward": 1.6239583849906922, + "reward_std": 0.43792185485363005, + "rewards/accuracy_reward": 0.06250000149011611, + "rewards/format_reward": 0.8395833551883698, + "rewards/tag_count_reward": 0.7218750178813934, + "step": 1928 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.5145935058594, + "epoch": 0.6173787806048968, + "grad_norm": 0.24433894455432892, + "kl": 0.6602077126502991, + "learning_rate": 7.66907837880138e-06, + "loss": 0.1903, + "reward": 1.645312535762787, + "reward_std": 0.4145914763212204, + "rewards/accuracy_reward": 0.05416666734963656, + "rewards/format_reward": 0.8729166805744171, + "rewards/tag_count_reward": 0.7182291865348815, + "step": 1929 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.279183959961, + "epoch": 0.61769883181309, + "grad_norm": 0.20922113955020905, + "kl": 0.6923602253198624, + "learning_rate": 7.65821161770654e-06, + "loss": 0.1745, + "reward": 1.7348958611488343, + "reward_std": 0.34830624908208846, + "rewards/accuracy_reward": 0.12500000260770322, + "rewards/format_reward": 0.8791666865348816, + "rewards/tag_count_reward": 0.7307291924953461, + "step": 1930 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.6875274658203, + "epoch": 0.6180188830212834, + "grad_norm": 0.2695390284061432, + "kl": 0.9710577547550201, + "learning_rate": 7.64734778161025e-06, + "loss": 0.2045, + "reward": 1.6416667103767395, + "reward_std": 0.40317414700984955, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.8562500238418579, + "rewards/tag_count_reward": 0.7083333551883697, + "step": 1931 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.702099609375, + "epoch": 0.6183389342294767, + "grad_norm": 0.2840893268585205, + "kl": 0.859143078327179, + "learning_rate": 7.636486884081937e-06, + "loss": 0.1529, + "reward": 1.7026041865348815, + "reward_std": 0.35204153060913085, + "rewards/accuracy_reward": 0.0854166692122817, + "rewards/format_reward": 0.8958333611488343, + "rewards/tag_count_reward": 0.7213541865348816, + "step": 1932 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.195849609375, + "epoch": 0.6186589854376701, + "grad_norm": 0.18182797729969025, + "kl": 0.4233370810747147, + "learning_rate": 7.625628938687349e-06, + "loss": 0.143, + "reward": 1.802083384990692, + "reward_std": 0.2987508878111839, + "rewards/accuracy_reward": 0.15625000428408384, + "rewards/format_reward": 0.9187500178813934, + "rewards/tag_count_reward": 0.7270833551883698, + "step": 1933 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.0687652587891, + "epoch": 0.6189790366458633, + "grad_norm": 0.4376506805419922, + "kl": 1.092165270447731, + "learning_rate": 7.614773958988539e-06, + "loss": 0.2685, + "reward": 1.6338541984558106, + "reward_std": 0.3932381421327591, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.887500011920929, + "rewards/tag_count_reward": 0.7088541865348816, + "step": 1934 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.0875091552734, + "epoch": 0.6192990878540566, + "grad_norm": 0.36206111311912537, + "kl": 0.5829787597060203, + "learning_rate": 7.6039219585438676e-06, + "loss": 0.1531, + "reward": 1.6567708730697632, + "reward_std": 0.31807751953601837, + "rewards/accuracy_reward": 0.020833334140479564, + "rewards/format_reward": 0.9104166924953461, + "rewards/tag_count_reward": 0.7255208611488342, + "step": 1935 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.9791870117188, + "epoch": 0.61961913906225, + "grad_norm": 0.37123608589172363, + "kl": 0.6474206149578094, + "learning_rate": 7.593072950907969e-06, + "loss": 0.1671, + "reward": 1.6927083611488343, + "reward_std": 0.3547057643532753, + "rewards/accuracy_reward": 0.05833333469927311, + "rewards/format_reward": 0.9125000298023224, + "rewards/tag_count_reward": 0.721875011920929, + "step": 1936 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.4437622070312, + "epoch": 0.6199391902704433, + "grad_norm": 0.2424151450395584, + "kl": 0.39393432140350343, + "learning_rate": 7.582226949631737e-06, + "loss": 0.1283, + "reward": 1.7213542222976685, + "reward_std": 0.25976524502038956, + "rewards/accuracy_reward": 0.050000001303851606, + "rewards/format_reward": 0.9416666924953461, + "rewards/tag_count_reward": 0.7296875238418579, + "step": 1937 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.4000213623046, + "epoch": 0.6202592414786365, + "grad_norm": 0.1958591789007187, + "kl": 0.5948910281062126, + "learning_rate": 7.571383968262317e-06, + "loss": 0.1273, + "reward": 1.8114583611488342, + "reward_std": 0.2821238741278648, + "rewards/accuracy_reward": 0.1645833384245634, + "rewards/format_reward": 0.9250000238418579, + "rewards/tag_count_reward": 0.7218750298023224, + "step": 1938 + }, + { + "clip_ratio": 0.0, + "completion_length": 504.9333526611328, + "epoch": 0.6205792926868299, + "grad_norm": 0.17749886214733124, + "kl": 0.2720214515924454, + "learning_rate": 7.560544020343071e-06, + "loss": 0.0659, + "reward": 1.8145833611488342, + "reward_std": 0.1834358900785446, + "rewards/accuracy_reward": 0.10625000409781933, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7458333551883698, + "step": 1939 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.8000183105469, + "epoch": 0.6208993438950232, + "grad_norm": 0.25111111998558044, + "kl": 0.45597586706280707, + "learning_rate": 7.5497071194135875e-06, + "loss": 0.1531, + "reward": 1.692187535762787, + "reward_std": 0.27718111127614975, + "rewards/accuracy_reward": 0.03333333451300859, + "rewards/format_reward": 0.931250023841858, + "rewards/tag_count_reward": 0.727604192495346, + "step": 1940 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.0354309082031, + "epoch": 0.6212193951032166, + "grad_norm": 0.17664361000061035, + "kl": 0.3269217021763325, + "learning_rate": 7.538873279009637e-06, + "loss": 0.1029, + "reward": 1.7447916984558105, + "reward_std": 0.23195823952555655, + "rewards/accuracy_reward": 0.05625000055879355, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 1941 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.0333404541016, + "epoch": 0.6215394463114098, + "grad_norm": 0.18127109110355377, + "kl": 0.3480118840932846, + "learning_rate": 7.528042512663174e-06, + "loss": 0.0889, + "reward": 1.6937500357627868, + "reward_std": 0.2795826196670532, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.9166666865348816, + "rewards/tag_count_reward": 0.743750023841858, + "step": 1942 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.0354309082031, + "epoch": 0.6218594975196031, + "grad_norm": 0.1579395979642868, + "kl": 0.2698191873729229, + "learning_rate": 7.517214833902307e-06, + "loss": 0.0553, + "reward": 1.882812535762787, + "reward_std": 0.16483215913176535, + "rewards/accuracy_reward": 0.16458333637565375, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.745312511920929, + "step": 1943 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.779183959961, + "epoch": 0.6221795487277965, + "grad_norm": 0.17572399973869324, + "kl": 0.28989646807312963, + "learning_rate": 7.506390256251294e-06, + "loss": 0.0957, + "reward": 1.7708333611488343, + "reward_std": 0.21848259344697, + "rewards/accuracy_reward": 0.07083333525806665, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 1944 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.0583465576171, + "epoch": 0.6224995999359898, + "grad_norm": 0.2514660060405731, + "kl": 0.25404314175248144, + "learning_rate": 7.495568793230516e-06, + "loss": 0.0653, + "reward": 1.7812500476837159, + "reward_std": 0.1805768422782421, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 1945 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.3854309082031, + "epoch": 0.622819651144183, + "grad_norm": 0.09252886474132538, + "kl": 0.20499701499938966, + "learning_rate": 7.484750458356467e-06, + "loss": 0.0657, + "reward": 1.8531250476837158, + "reward_std": 0.25213652551174165, + "rewards/accuracy_reward": 0.15625000335276126, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 1946 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.2666870117188, + "epoch": 0.6231397023523764, + "grad_norm": 0.12097577005624771, + "kl": 0.39013560861349106, + "learning_rate": 7.47393526514173e-06, + "loss": 0.0894, + "reward": 1.754687535762787, + "reward_std": 0.23477007076144218, + "rewards/accuracy_reward": 0.06250000167638063, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.735937523841858, + "step": 1947 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.0771118164063, + "epoch": 0.6234597535605697, + "grad_norm": 0.2490178346633911, + "kl": 0.3661611221730709, + "learning_rate": 7.463123227094962e-06, + "loss": 0.0698, + "reward": 1.8302083611488342, + "reward_std": 0.17300696298480034, + "rewards/accuracy_reward": 0.12291666977107525, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7385416924953461, + "step": 1948 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.6500122070313, + "epoch": 0.623779804768763, + "grad_norm": 0.151083305478096, + "kl": 0.30013838559389117, + "learning_rate": 7.452314357720888e-06, + "loss": 0.094, + "reward": 1.7109375238418578, + "reward_std": 0.22202819362282752, + "rewards/accuracy_reward": 0.01875000037252903, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 1949 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.785433959961, + "epoch": 0.6240998559769563, + "grad_norm": 0.1321251392364502, + "kl": 0.3549846962094307, + "learning_rate": 7.441508670520271e-06, + "loss": 0.1134, + "reward": 1.7364583611488342, + "reward_std": 0.14900253042578698, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 1950 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.3896026611328, + "epoch": 0.6244199071851496, + "grad_norm": 0.15130139887332916, + "kl": 0.3252772256731987, + "learning_rate": 7.430706178989895e-06, + "loss": 0.0679, + "reward": 1.784375047683716, + "reward_std": 0.1683867707848549, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 1951 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.220849609375, + "epoch": 0.624739958393343, + "grad_norm": 0.11556895822286606, + "kl": 0.2892810679972172, + "learning_rate": 7.419906896622556e-06, + "loss": 0.1023, + "reward": 1.7937500596046447, + "reward_std": 0.2092138223350048, + "rewards/accuracy_reward": 0.0895833358168602, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 1952 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.4854339599609, + "epoch": 0.6250600096015363, + "grad_norm": 0.10080796480178833, + "kl": 0.22869173735380172, + "learning_rate": 7.409110836907041e-06, + "loss": 0.0724, + "reward": 1.8031250596046449, + "reward_std": 0.1844940721988678, + "rewards/accuracy_reward": 0.08750000186264514, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 1953 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.5729370117188, + "epoch": 0.6253800608097295, + "grad_norm": 0.10239440947771072, + "kl": 0.32757807746529577, + "learning_rate": 7.398318013328112e-06, + "loss": 0.1184, + "reward": 1.7677083969116212, + "reward_std": 0.23693208321928977, + "rewards/accuracy_reward": 0.08333333451300859, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7302083551883698, + "step": 1954 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.2791778564454, + "epoch": 0.6257001120179229, + "grad_norm": 0.08784143626689911, + "kl": 0.23789920881390572, + "learning_rate": 7.387528439366491e-06, + "loss": 0.089, + "reward": 1.7901042103767395, + "reward_std": 0.19504887610673904, + "rewards/accuracy_reward": 0.08125000353902578, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 1955 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.0375244140625, + "epoch": 0.6260201632261162, + "grad_norm": 0.09152630716562271, + "kl": 0.23061190843582152, + "learning_rate": 7.376742128498835e-06, + "loss": 0.0863, + "reward": 1.8093750119209289, + "reward_std": 0.20874695628881454, + "rewards/accuracy_reward": 0.1062500026077032, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 1956 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.7562622070312, + "epoch": 0.6263402144343095, + "grad_norm": 0.07687767595052719, + "kl": 0.2703046713024378, + "learning_rate": 7.365959094197734e-06, + "loss": 0.0544, + "reward": 1.8505208492279053, + "reward_std": 0.15929230451583862, + "rewards/accuracy_reward": 0.1395833373069763, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 1957 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.7083618164063, + "epoch": 0.6266602656425028, + "grad_norm": 0.05818747729063034, + "kl": 0.14957574531435966, + "learning_rate": 7.35517934993168e-06, + "loss": 0.0521, + "reward": 1.7531250357627868, + "reward_std": 0.15787961557507516, + "rewards/accuracy_reward": 0.03125000074505806, + "rewards/format_reward": 0.9791666805744171, + "rewards/tag_count_reward": 0.7427083432674408, + "step": 1958 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.4000183105469, + "epoch": 0.6269803168506961, + "grad_norm": 0.1020335853099823, + "kl": 0.3409750394523144, + "learning_rate": 7.344402909165053e-06, + "loss": 0.0373, + "reward": 1.7281250357627869, + "reward_std": 0.1918536826968193, + "rewards/accuracy_reward": 0.018750000558793545, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 1959 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.8187713623047, + "epoch": 0.6273003680588894, + "grad_norm": 0.09709202498197556, + "kl": 0.25619880557060243, + "learning_rate": 7.3336297853581115e-06, + "loss": 0.0861, + "reward": 1.7859375476837158, + "reward_std": 0.18421316221356393, + "rewards/accuracy_reward": 0.08125000242143869, + "rewards/format_reward": 0.9666666984558105, + "rewards/tag_count_reward": 0.7380208611488343, + "step": 1960 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.2333557128907, + "epoch": 0.6276204192670828, + "grad_norm": 0.32538357377052307, + "kl": 0.4842786967754364, + "learning_rate": 7.322859991966973e-06, + "loss": 0.0732, + "reward": 1.7713542103767395, + "reward_std": 0.15907796677201985, + "rewards/accuracy_reward": 0.05833333488553762, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 1961 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.0833557128906, + "epoch": 0.627940470475276, + "grad_norm": 0.20227976143360138, + "kl": 0.24760584570467473, + "learning_rate": 7.3120935424435856e-06, + "loss": 0.0577, + "reward": 1.770312535762787, + "reward_std": 0.15020546615123748, + "rewards/accuracy_reward": 0.05833333488553762, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.745312511920929, + "step": 1962 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.2000152587891, + "epoch": 0.6282605216834694, + "grad_norm": 0.12068881839513779, + "kl": 0.23002145811915398, + "learning_rate": 7.301330450235733e-06, + "loss": 0.0691, + "reward": 1.7859375596046447, + "reward_std": 0.20177326947450638, + "rewards/accuracy_reward": 0.07916666772216559, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 1963 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.9041809082031, + "epoch": 0.6285805728916627, + "grad_norm": 0.08962026238441467, + "kl": 0.269534295797348, + "learning_rate": 7.290570728786992e-06, + "loss": 0.0775, + "reward": 1.7885417103767396, + "reward_std": 0.19415293484926224, + "rewards/accuracy_reward": 0.08750000372529029, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 1964 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.2771026611329, + "epoch": 0.628900624099856, + "grad_norm": 0.04769608750939369, + "kl": 0.21004538014531135, + "learning_rate": 7.279814391536744e-06, + "loss": 0.015, + "reward": 1.8588542342185974, + "reward_std": 0.13069503456354142, + "rewards/accuracy_reward": 0.13333333544433118, + "rewards/format_reward": 0.9791666746139527, + "rewards/tag_count_reward": 0.7463541686534881, + "step": 1965 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.3083557128906, + "epoch": 0.6292206753080493, + "grad_norm": 0.0883219763636589, + "kl": 0.14996635988354684, + "learning_rate": 7.2690614519201315e-06, + "loss": 0.0513, + "reward": 1.8463542103767394, + "reward_std": 0.1668414853513241, + "rewards/accuracy_reward": 0.1354166716337204, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 1966 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.1729370117188, + "epoch": 0.6295407265162426, + "grad_norm": 0.10870802402496338, + "kl": 0.22853438630700112, + "learning_rate": 7.258311923368062e-06, + "loss": 0.0778, + "reward": 1.8182292103767395, + "reward_std": 0.2189410574734211, + "rewards/accuracy_reward": 0.11666667070239782, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7390625298023223, + "step": 1967 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.5916809082031, + "epoch": 0.6298607777244359, + "grad_norm": 2.5341570377349854, + "kl": 0.30130406394600867, + "learning_rate": 7.247565819307172e-06, + "loss": 0.0612, + "reward": 1.808333396911621, + "reward_std": 0.18026887029409408, + "rewards/accuracy_reward": 0.09791667014360428, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 1968 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.8312713623047, + "epoch": 0.6301808289326292, + "grad_norm": 0.7344006299972534, + "kl": 0.7502999603748322, + "learning_rate": 7.236823153159832e-06, + "loss": 0.1116, + "reward": 1.283854216337204, + "reward_std": 0.47162422239780427, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.49375001192092893, + "rewards/tag_count_reward": 0.7505208492279053, + "step": 1969 + }, + { + "clip_ratio": 0.0, + "completion_length": 628.1062622070312, + "epoch": 0.6305008801408225, + "grad_norm": 1.0056172609329224, + "kl": 1.3431001484394074, + "learning_rate": 7.226083938344108e-06, + "loss": 0.1292, + "reward": 1.2906250357627869, + "reward_std": 0.49476856291294097, + "rewards/accuracy_reward": 0.05416666883975267, + "rewards/format_reward": 0.4625000149011612, + "rewards/tag_count_reward": 0.7739583671092987, + "step": 1970 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.4312622070313, + "epoch": 0.6308209313490158, + "grad_norm": 0.4282771646976471, + "kl": 0.5381355553865432, + "learning_rate": 7.215348188273768e-06, + "loss": 0.0965, + "reward": 1.5119791984558106, + "reward_std": 0.46053238213062286, + "rewards/accuracy_reward": 0.01875000037252903, + "rewards/format_reward": 0.7354166805744171, + "rewards/tag_count_reward": 0.7578125178813935, + "step": 1971 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.3479370117187, + "epoch": 0.6311409825572092, + "grad_norm": 0.1805924028158188, + "kl": 0.44332694709300996, + "learning_rate": 7.204615916358234e-06, + "loss": 0.1127, + "reward": 1.722395896911621, + "reward_std": 0.33906579315662383, + "rewards/accuracy_reward": 0.1354166716337204, + "rewards/format_reward": 0.8291666865348816, + "rewards/tag_count_reward": 0.7578125238418579, + "step": 1972 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.2000183105469, + "epoch": 0.6314610337654024, + "grad_norm": 0.21334503591060638, + "kl": 0.3289230242371559, + "learning_rate": 7.193887136002599e-06, + "loss": 0.0996, + "reward": 1.7250000476837157, + "reward_std": 0.27289713025093076, + "rewards/accuracy_reward": 0.07916666902601718, + "rewards/format_reward": 0.8875000178813934, + "rewards/tag_count_reward": 0.7583333611488342, + "step": 1973 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.633349609375, + "epoch": 0.6317810849735958, + "grad_norm": 0.16588036715984344, + "kl": 0.34746977835893633, + "learning_rate": 7.183161860607592e-06, + "loss": 0.0993, + "reward": 1.7114583611488343, + "reward_std": 0.3407271146774292, + "rewards/accuracy_reward": 0.09375000279396772, + "rewards/format_reward": 0.8625000238418579, + "rewards/tag_count_reward": 0.7552083551883697, + "step": 1974 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.1104370117188, + "epoch": 0.6321011361817891, + "grad_norm": 0.4449247717857361, + "kl": 0.354316396266222, + "learning_rate": 7.172440103569566e-06, + "loss": 0.1278, + "reward": 1.7458333849906922, + "reward_std": 0.26184590086340903, + "rewards/accuracy_reward": 0.09791666977107524, + "rewards/format_reward": 0.8937500178813934, + "rewards/tag_count_reward": 0.7541666865348816, + "step": 1975 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.2104370117188, + "epoch": 0.6324211873899824, + "grad_norm": 0.343107134103775, + "kl": 0.3772110417485237, + "learning_rate": 7.161721878280467e-06, + "loss": 0.1068, + "reward": 1.7208333730697631, + "reward_std": 0.30226452052593233, + "rewards/accuracy_reward": 0.08750000260770321, + "rewards/format_reward": 0.8791666924953461, + "rewards/tag_count_reward": 0.7541666865348816, + "step": 1976 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.1625122070312, + "epoch": 0.6327412385981757, + "grad_norm": 0.5401050448417664, + "kl": 0.32526273727416993, + "learning_rate": 7.151007198127844e-06, + "loss": 0.1215, + "reward": 1.7130208611488342, + "reward_std": 0.2510280154645443, + "rewards/accuracy_reward": 0.03958333414047956, + "rewards/format_reward": 0.9270833432674408, + "rewards/tag_count_reward": 0.7463541746139526, + "step": 1977 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.6208587646485, + "epoch": 0.633061289806369, + "grad_norm": 0.34722357988357544, + "kl": 0.4616437517106533, + "learning_rate": 7.140296076494809e-06, + "loss": 0.0943, + "reward": 1.7385416984558106, + "reward_std": 0.24983570948243142, + "rewards/accuracy_reward": 0.05833333507180214, + "rewards/format_reward": 0.9333333492279052, + "rewards/tag_count_reward": 0.7468750178813934, + "step": 1978 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.008349609375, + "epoch": 0.6333813410145623, + "grad_norm": 0.18879617750644684, + "kl": 0.3938341312110424, + "learning_rate": 7.129588526760036e-06, + "loss": 0.0789, + "reward": 1.7145833730697633, + "reward_std": 0.3124412089586258, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.9125000238418579, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 1979 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.4187591552734, + "epoch": 0.6337013922227557, + "grad_norm": 0.45781409740448, + "kl": 0.4807680070400238, + "learning_rate": 7.11888456229773e-06, + "loss": 0.0984, + "reward": 1.6822917103767394, + "reward_std": 0.3750412121415138, + "rewards/accuracy_reward": 0.04791666753590107, + "rewards/format_reward": 0.8979166865348815, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 1980 + }, + { + "clip_ratio": 0.0, + "completion_length": 744.9520935058594, + "epoch": 0.6340214434309489, + "grad_norm": 9.180135726928711, + "kl": 2.3556045293807983, + "learning_rate": 7.108184196477622e-06, + "loss": 0.2895, + "reward": 1.3651041984558105, + "reward_std": 0.4459951549768448, + "rewards/accuracy_reward": 0.04791666902601719, + "rewards/format_reward": 0.48541668355464934, + "rewards/tag_count_reward": 0.8317708611488343, + "step": 1981 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.0333557128906, + "epoch": 0.6343414946391422, + "grad_norm": 0.34384164214134216, + "kl": 0.6166922569274902, + "learning_rate": 7.097487442664952e-06, + "loss": 0.1315, + "reward": 1.853645884990692, + "reward_std": 0.30587767958641054, + "rewards/accuracy_reward": 0.20416667461395263, + "rewards/format_reward": 0.9187500178813934, + "rewards/tag_count_reward": 0.7307291865348816, + "step": 1982 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.9729309082031, + "epoch": 0.6346615458473356, + "grad_norm": 0.19648754596710205, + "kl": 0.4563448905944824, + "learning_rate": 7.086794314220445e-06, + "loss": 0.1238, + "reward": 1.7338542103767396, + "reward_std": 0.2317870318889618, + "rewards/accuracy_reward": 0.05625000204890966, + "rewards/format_reward": 0.9416666805744172, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 1983 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.489599609375, + "epoch": 0.6349815970555289, + "grad_norm": 0.3531632125377655, + "kl": 0.7605800554156303, + "learning_rate": 7.076104824500294e-06, + "loss": 0.1186, + "reward": 1.7307291984558106, + "reward_std": 0.2941200569272041, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.9187500178813934, + "rewards/tag_count_reward": 0.736979192495346, + "step": 1984 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.0041809082031, + "epoch": 0.6353016482637222, + "grad_norm": 0.2388213574886322, + "kl": 0.5445457905530929, + "learning_rate": 7.0654189868561515e-06, + "loss": 0.1156, + "reward": 1.7270833611488343, + "reward_std": 0.22447171062231064, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 1985 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.5666870117187, + "epoch": 0.6356216994719155, + "grad_norm": 0.1977159082889557, + "kl": 0.4999621480703354, + "learning_rate": 7.054736814635106e-06, + "loss": 0.0949, + "reward": 1.803645873069763, + "reward_std": 0.24051545932888985, + "rewards/accuracy_reward": 0.12500000335276126, + "rewards/format_reward": 0.9416666805744172, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 1986 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.5979400634766, + "epoch": 0.6359417506801088, + "grad_norm": 0.3220207691192627, + "kl": 0.5277846544981003, + "learning_rate": 7.044058321179671e-06, + "loss": 0.1443, + "reward": 1.7963541984558105, + "reward_std": 0.2664462685585022, + "rewards/accuracy_reward": 0.1125000037252903, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 1987 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.0437683105469, + "epoch": 0.6362618018883022, + "grad_norm": 0.14741215109825134, + "kl": 0.372174845635891, + "learning_rate": 7.033383519827763e-06, + "loss": 0.099, + "reward": 1.7000000476837158, + "reward_std": 0.22901730239391327, + "rewards/accuracy_reward": 0.022916667722165586, + "rewards/format_reward": 0.9375000238418579, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 1988 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.4062713623047, + "epoch": 0.6365818530964954, + "grad_norm": 0.19315893948078156, + "kl": 0.5451448887586594, + "learning_rate": 7.022712423912682e-06, + "loss": 0.115, + "reward": 1.7458333730697633, + "reward_std": 0.28968684524297716, + "rewards/accuracy_reward": 0.08125000298023224, + "rewards/format_reward": 0.9333333611488343, + "rewards/tag_count_reward": 0.7312500238418579, + "step": 1989 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.4125213623047, + "epoch": 0.6369019043046887, + "grad_norm": 0.09960630536079407, + "kl": 0.27057635635137556, + "learning_rate": 7.012045046763111e-06, + "loss": 0.0506, + "reward": 1.8739583849906922, + "reward_std": 0.16413490921258928, + "rewards/accuracy_reward": 0.15625000223517418, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7427083373069763, + "step": 1990 + }, + { + "clip_ratio": 0.0, + "completion_length": 501.5854248046875, + "epoch": 0.6372219555128821, + "grad_norm": 0.11986377090215683, + "kl": 0.22691030353307723, + "learning_rate": 7.00138140170308e-06, + "loss": 0.1244, + "reward": 1.7781250596046447, + "reward_std": 0.19470058530569076, + "rewards/accuracy_reward": 0.08333333544433116, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7447916746139527, + "step": 1991 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.3812683105468, + "epoch": 0.6375420067210754, + "grad_norm": 0.1236194372177124, + "kl": 0.204738799482584, + "learning_rate": 6.990721502051958e-06, + "loss": 0.073, + "reward": 1.8322917222976685, + "reward_std": 0.23166320994496345, + "rewards/accuracy_reward": 0.13750000521540642, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 1992 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.5916809082031, + "epoch": 0.6378620579292686, + "grad_norm": 0.24665555357933044, + "kl": 0.27333122715353964, + "learning_rate": 6.980065361124437e-06, + "loss": 0.0945, + "reward": 1.7286458611488342, + "reward_std": 0.22112812250852584, + "rewards/accuracy_reward": 0.03333333451300859, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 1993 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.4541839599609, + "epoch": 0.638182109137462, + "grad_norm": 0.23802857100963593, + "kl": 0.259119226410985, + "learning_rate": 6.969412992230518e-06, + "loss": 0.0884, + "reward": 1.756770873069763, + "reward_std": 0.16711597740650178, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.9500000298023223, + "rewards/tag_count_reward": 0.7380208611488343, + "step": 1994 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.2521026611328, + "epoch": 0.6385021603456553, + "grad_norm": 0.1773439198732376, + "kl": 0.22130618281662465, + "learning_rate": 6.95876440867548e-06, + "loss": 0.1208, + "reward": 1.7510417222976684, + "reward_std": 0.22782276272773744, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.9416666805744172, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 1995 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.5958526611328, + "epoch": 0.6388222115538487, + "grad_norm": 0.13545185327529907, + "kl": 0.15213358253240586, + "learning_rate": 6.948119623759888e-06, + "loss": 0.0606, + "reward": 1.8223958730697631, + "reward_std": 0.18742815032601357, + "rewards/accuracy_reward": 0.11875000204890966, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 1996 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.2375183105469, + "epoch": 0.6391422627620419, + "grad_norm": 0.12271955609321594, + "kl": 0.21166965663433074, + "learning_rate": 6.937478650779548e-06, + "loss": 0.086, + "reward": 1.7151041984558106, + "reward_std": 0.2101954735815525, + "rewards/accuracy_reward": 0.016666666977107523, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 1997 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.2770965576171, + "epoch": 0.6394623139702352, + "grad_norm": 0.10828928649425507, + "kl": 0.31541863903403283, + "learning_rate": 6.926841503025513e-06, + "loss": 0.0909, + "reward": 1.7812500596046448, + "reward_std": 0.17954575568437575, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 1998 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.4833404541016, + "epoch": 0.6397823651784286, + "grad_norm": 0.13755281269550323, + "kl": 0.34499868750572205, + "learning_rate": 6.916208193784062e-06, + "loss": 0.0742, + "reward": 1.8619792222976685, + "reward_std": 0.22622655108571052, + "rewards/accuracy_reward": 0.16666667070239782, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 1999 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.8062744140625, + "epoch": 0.6401024163866219, + "grad_norm": 0.1748357117176056, + "kl": 0.20692237839102745, + "learning_rate": 6.905578736336673e-06, + "loss": 0.1086, + "reward": 1.760937511920929, + "reward_std": 0.18903874084353448, + "rewards/accuracy_reward": 0.05416666716337204, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2000 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.4271026611328, + "epoch": 0.6404224675948151, + "grad_norm": 0.11439496278762817, + "kl": 0.3237125493586063, + "learning_rate": 6.89495314396001e-06, + "loss": 0.0511, + "reward": 1.778125023841858, + "reward_std": 0.17002107575535774, + "rewards/accuracy_reward": 0.07500000111758709, + "rewards/format_reward": 0.9625000059604645, + "rewards/tag_count_reward": 0.740625011920929, + "step": 2001 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.7208526611328, + "epoch": 0.6407425188030085, + "grad_norm": 0.07779530435800552, + "kl": 0.18974074572324753, + "learning_rate": 6.884331429925919e-06, + "loss": 0.0578, + "reward": 1.717708373069763, + "reward_std": 0.20087785869836808, + "rewards/accuracy_reward": 0.01875000074505806, + "rewards/format_reward": 0.9604166746139526, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2002 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.1646087646484, + "epoch": 0.6410625700112018, + "grad_norm": 0.15299402177333832, + "kl": 0.20888833254575728, + "learning_rate": 6.8737136075013925e-06, + "loss": 0.0718, + "reward": 1.7515625357627869, + "reward_std": 0.12168702185153961, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7453125178813934, + "step": 2003 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.0583526611328, + "epoch": 0.6413826212193952, + "grad_norm": 0.2596868574619293, + "kl": 0.48813455291092395, + "learning_rate": 6.863099689948569e-06, + "loss": 0.123, + "reward": 1.8130208849906921, + "reward_std": 0.26384441256523133, + "rewards/accuracy_reward": 0.11875000055879355, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2004 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.629183959961, + "epoch": 0.6417026724275884, + "grad_norm": 0.19190722703933716, + "kl": 0.1685401350259781, + "learning_rate": 6.852489690524703e-06, + "loss": 0.0524, + "reward": 1.8177083611488343, + "reward_std": 0.12307879701256752, + "rewards/accuracy_reward": 0.08958333544433117, + "rewards/format_reward": 0.9833333492279053, + "rewards/tag_count_reward": 0.7447916805744171, + "step": 2005 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.3729278564454, + "epoch": 0.6420227236357817, + "grad_norm": 0.21922363340854645, + "kl": 0.2787697918713093, + "learning_rate": 6.84188362248216e-06, + "loss": 0.0813, + "reward": 1.786458384990692, + "reward_std": 0.16800421923398973, + "rewards/accuracy_reward": 0.0687500024214387, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.746875011920929, + "step": 2006 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.2312652587891, + "epoch": 0.6423427748439751, + "grad_norm": 0.09753888100385666, + "kl": 0.174386228621006, + "learning_rate": 6.831281499068396e-06, + "loss": 0.0773, + "reward": 1.7661458730697632, + "reward_std": 0.1401790753006935, + "rewards/accuracy_reward": 0.0458333345130086, + "rewards/format_reward": 0.975000011920929, + "rewards/tag_count_reward": 0.7453125178813934, + "step": 2007 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.6979370117188, + "epoch": 0.6426628260521684, + "grad_norm": 0.13030627369880676, + "kl": 0.1947503164410591, + "learning_rate": 6.820683333525942e-06, + "loss": 0.0801, + "reward": 1.7135417222976685, + "reward_std": 0.15063118934631348, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7447916805744171, + "step": 2008 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.7104400634765, + "epoch": 0.6429828772603616, + "grad_norm": 0.15181896090507507, + "kl": 0.35972325801849364, + "learning_rate": 6.810089139092371e-06, + "loss": 0.178, + "reward": 1.7250000596046449, + "reward_std": 0.2836567386984825, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.9145833611488342, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2009 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.7541931152343, + "epoch": 0.643302928468555, + "grad_norm": 0.19096185266971588, + "kl": 0.5030779674649238, + "learning_rate": 6.7994989290003165e-06, + "loss": 0.2042, + "reward": 1.663020873069763, + "reward_std": 0.33288909047842025, + "rewards/accuracy_reward": 0.027083334513008595, + "rewards/format_reward": 0.8875000238418579, + "rewards/tag_count_reward": 0.7484375178813935, + "step": 2010 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.2125183105469, + "epoch": 0.6436229796767483, + "grad_norm": 0.10490421950817108, + "kl": 0.27069406807422636, + "learning_rate": 6.788912716477417e-06, + "loss": 0.1012, + "reward": 1.7651042103767396, + "reward_std": 0.26789544969797136, + "rewards/accuracy_reward": 0.07708333507180214, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2011 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.5729309082031, + "epoch": 0.6439430308849415, + "grad_norm": 0.0868968516588211, + "kl": 0.19741727262735367, + "learning_rate": 6.7783305147463295e-06, + "loss": 0.06, + "reward": 1.7609375238418579, + "reward_std": 0.1284565381705761, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7463541746139526, + "step": 2012 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.556265258789, + "epoch": 0.6442630820931349, + "grad_norm": 0.6286680102348328, + "kl": 0.3144658826291561, + "learning_rate": 6.7677523370247e-06, + "loss": 0.0986, + "reward": 1.763020896911621, + "reward_std": 0.21080692261457443, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2013 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.675015258789, + "epoch": 0.6445831333013282, + "grad_norm": 0.06401374191045761, + "kl": 0.15224939808249474, + "learning_rate": 6.7571781965251405e-06, + "loss": 0.056, + "reward": 1.7734375238418578, + "reward_std": 0.1219717726111412, + "rewards/accuracy_reward": 0.0520833358168602, + "rewards/format_reward": 0.9770833551883698, + "rewards/tag_count_reward": 0.7442708373069763, + "step": 2014 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.820849609375, + "epoch": 0.6449031845095216, + "grad_norm": 0.1311735063791275, + "kl": 0.2862920179963112, + "learning_rate": 6.746608106455231e-06, + "loss": 0.0856, + "reward": 1.8614583730697631, + "reward_std": 0.19478606656193734, + "rewards/accuracy_reward": 0.15208333544433117, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2015 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.3146026611328, + "epoch": 0.6452232357177148, + "grad_norm": 0.10083205252885818, + "kl": 0.260428823530674, + "learning_rate": 6.736042080017488e-06, + "loss": 0.0828, + "reward": 1.7416666984558105, + "reward_std": 0.19739598780870438, + "rewards/accuracy_reward": 0.03541666828095913, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2016 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.060433959961, + "epoch": 0.6455432869259081, + "grad_norm": 0.1420152336359024, + "kl": 0.3134328491985798, + "learning_rate": 6.725480130409347e-06, + "loss": 0.0788, + "reward": 1.8203125476837159, + "reward_std": 0.20216676890850066, + "rewards/accuracy_reward": 0.11666667275130749, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 2017 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.4458526611328, + "epoch": 0.6458633381341015, + "grad_norm": 0.1362895667552948, + "kl": 0.3763896100223064, + "learning_rate": 6.714922270823159e-06, + "loss": 0.0935, + "reward": 1.7828125476837158, + "reward_std": 0.1553657740354538, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7411458611488342, + "step": 2018 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.6896057128906, + "epoch": 0.6461833893422948, + "grad_norm": 0.3411266505718231, + "kl": 0.4690343365073204, + "learning_rate": 6.704368514446165e-06, + "loss": 0.1149, + "reward": 1.7442708730697631, + "reward_std": 0.2608324535191059, + "rewards/accuracy_reward": 0.0666666679084301, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2019 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.6354339599609, + "epoch": 0.646503440550488, + "grad_norm": 0.24700689315795898, + "kl": 0.6471106797456742, + "learning_rate": 6.693818874460475e-06, + "loss": 0.1784, + "reward": 1.7640625357627868, + "reward_std": 0.3167236685752869, + "rewards/accuracy_reward": 0.09791667070239782, + "rewards/format_reward": 0.9375000238418579, + "rewards/tag_count_reward": 0.7286458611488342, + "step": 2020 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.764599609375, + "epoch": 0.6468234917586814, + "grad_norm": 0.18885403871536255, + "kl": 0.628251314163208, + "learning_rate": 6.683273364043066e-06, + "loss": 0.1355, + "reward": 1.7161458611488343, + "reward_std": 0.2554626792669296, + "rewards/accuracy_reward": 0.03541666809469461, + "rewards/format_reward": 0.9479166805744171, + "rewards/tag_count_reward": 0.7328125178813935, + "step": 2021 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.8416870117187, + "epoch": 0.6471435429668747, + "grad_norm": 0.3838038742542267, + "kl": 0.5382826343178749, + "learning_rate": 6.672731996365749e-06, + "loss": 0.1536, + "reward": 1.7510416984558106, + "reward_std": 0.26691520512104033, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.9395833611488342, + "rewards/tag_count_reward": 0.7302083492279052, + "step": 2022 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.8083557128906, + "epoch": 0.647463594175068, + "grad_norm": 0.47697538137435913, + "kl": 0.7491498619318009, + "learning_rate": 6.662194784595164e-06, + "loss": 0.1175, + "reward": 1.8489584088325501, + "reward_std": 0.20880869925022125, + "rewards/accuracy_reward": 0.1687500050291419, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7302083492279052, + "step": 2023 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.7791809082031, + "epoch": 0.6477836453832613, + "grad_norm": 0.19957475364208221, + "kl": 0.45173963755369184, + "learning_rate": 6.651661741892763e-06, + "loss": 0.0891, + "reward": 1.7526041865348816, + "reward_std": 0.19860148280858994, + "rewards/accuracy_reward": 0.05625, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2024 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.2416809082031, + "epoch": 0.6481036965914546, + "grad_norm": 0.21536926925182343, + "kl": 0.37086462080478666, + "learning_rate": 6.641132881414791e-06, + "loss": 0.0983, + "reward": 1.7984375357627869, + "reward_std": 0.18779040426015853, + "rewards/accuracy_reward": 0.09583333656191825, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7380208611488343, + "step": 2025 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.7312683105469, + "epoch": 0.648423747799648, + "grad_norm": 0.3578295111656189, + "kl": 0.3105326473712921, + "learning_rate": 6.63060821631226e-06, + "loss": 0.1002, + "reward": 1.8119792222976685, + "reward_std": 0.23218104541301726, + "rewards/accuracy_reward": 0.11875000223517418, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2026 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.6146057128906, + "epoch": 0.6487437990078413, + "grad_norm": 0.24499280750751495, + "kl": 0.26186653785407543, + "learning_rate": 6.6200877597309535e-06, + "loss": 0.0855, + "reward": 1.7635416746139527, + "reward_std": 0.14513484984636307, + "rewards/accuracy_reward": 0.052083334885537624, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7427083432674408, + "step": 2027 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.0458526611328, + "epoch": 0.6490638502160345, + "grad_norm": 0.30822300910949707, + "kl": 0.48599352315068245, + "learning_rate": 6.609571524811387e-06, + "loss": 0.1068, + "reward": 1.7473958611488343, + "reward_std": 0.24330125153064727, + "rewards/accuracy_reward": 0.05625000018626451, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2028 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.8708526611329, + "epoch": 0.6493839014242279, + "grad_norm": 0.16421116888523102, + "kl": 0.3570775203406811, + "learning_rate": 6.599059524688813e-06, + "loss": 0.0979, + "reward": 1.7531250596046448, + "reward_std": 0.19463911652565002, + "rewards/accuracy_reward": 0.05625000149011612, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 2029 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.38544921875, + "epoch": 0.6497039526324212, + "grad_norm": 0.24503841996192932, + "kl": 0.3006763093173504, + "learning_rate": 6.588551772493188e-06, + "loss": 0.0805, + "reward": 1.764062511920929, + "reward_std": 0.1812247857451439, + "rewards/accuracy_reward": 0.06041666716337204, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2030 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.9062805175781, + "epoch": 0.6500240038406145, + "grad_norm": 0.17819064855575562, + "kl": 0.44421446323394775, + "learning_rate": 6.578048281349165e-06, + "loss": 0.1009, + "reward": 1.6994791984558106, + "reward_std": 0.2802324160933495, + "rewards/accuracy_reward": 0.020833333395421504, + "rewards/format_reward": 0.9437500298023224, + "rewards/tag_count_reward": 0.7348958432674408, + "step": 2031 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.4812774658203, + "epoch": 0.6503440550488078, + "grad_norm": 0.11345545947551727, + "kl": 0.21580078080296516, + "learning_rate": 6.567549064376078e-06, + "loss": 0.0719, + "reward": 1.7338541984558105, + "reward_std": 0.19010756611824037, + "rewards/accuracy_reward": 0.027083334513008595, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7421875119209289, + "step": 2032 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.2791778564454, + "epoch": 0.6506641062570011, + "grad_norm": 0.12270388752222061, + "kl": 0.2688896611332893, + "learning_rate": 6.557054134687919e-06, + "loss": 0.0888, + "reward": 1.7796875476837157, + "reward_std": 0.25824336111545565, + "rewards/accuracy_reward": 0.0833333384245634, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7380208611488343, + "step": 2033 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.8437713623047, + "epoch": 0.6509841574651944, + "grad_norm": 0.07305414229631424, + "kl": 0.26273534893989564, + "learning_rate": 6.546563505393321e-06, + "loss": 0.0784, + "reward": 1.8338542222976684, + "reward_std": 0.21032921522855758, + "rewards/accuracy_reward": 0.12916667070239782, + "rewards/format_reward": 0.9645833432674408, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2034 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.4937622070313, + "epoch": 0.6513042086733878, + "grad_norm": 0.2946823537349701, + "kl": 0.26316218823194504, + "learning_rate": 6.536077189595554e-06, + "loss": 0.0756, + "reward": 1.7817708611488343, + "reward_std": 0.23125024437904357, + "rewards/accuracy_reward": 0.07708333432674408, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2035 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.3229339599609, + "epoch": 0.651624259881581, + "grad_norm": 0.1420002579689026, + "kl": 0.17707625590264797, + "learning_rate": 6.525595200392492e-06, + "loss": 0.0632, + "reward": 1.8250000119209289, + "reward_std": 0.15849317163228988, + "rewards/accuracy_reward": 0.11666666772216558, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7479166865348816, + "step": 2036 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.462515258789, + "epoch": 0.6519443110897744, + "grad_norm": 0.19155986607074738, + "kl": 0.36504338271915915, + "learning_rate": 6.515117550876615e-06, + "loss": 0.0828, + "reward": 1.8067708730697631, + "reward_std": 0.21143171042203904, + "rewards/accuracy_reward": 0.10625000223517418, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7380208373069763, + "step": 2037 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.7979339599609, + "epoch": 0.6522643622979677, + "grad_norm": 0.07730981707572937, + "kl": 0.13812856636941434, + "learning_rate": 6.504644254134969e-06, + "loss": 0.0477, + "reward": 1.7963541984558105, + "reward_std": 0.16553531140089034, + "rewards/accuracy_reward": 0.07083333469927311, + "rewards/format_reward": 0.9791666865348816, + "rewards/tag_count_reward": 0.7463541805744172, + "step": 2038 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.2937744140625, + "epoch": 0.652584413506161, + "grad_norm": 0.1270231455564499, + "kl": 0.2916695766150951, + "learning_rate": 6.4941753232491725e-06, + "loss": 0.0852, + "reward": 1.734375011920929, + "reward_std": 0.19963812083005905, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2039 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.2229370117187, + "epoch": 0.6529044647143543, + "grad_norm": 0.26288750767707825, + "kl": 0.240986368060112, + "learning_rate": 6.483710771295391e-06, + "loss": 0.0956, + "reward": 1.7734375476837159, + "reward_std": 0.2022160619497299, + "rewards/accuracy_reward": 0.07083333432674407, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7442708611488342, + "step": 2040 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.6500183105469, + "epoch": 0.6532245159225476, + "grad_norm": 0.123477503657341, + "kl": 0.2088707573711872, + "learning_rate": 6.4732506113443215e-06, + "loss": 0.1214, + "reward": 1.8208333849906921, + "reward_std": 0.21984335780143738, + "rewards/accuracy_reward": 0.13333333656191826, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2041 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.595849609375, + "epoch": 0.6535445671307409, + "grad_norm": 0.14980222284793854, + "kl": 0.2989436075091362, + "learning_rate": 6.462794856461167e-06, + "loss": 0.0931, + "reward": 1.745312547683716, + "reward_std": 0.17107095420360566, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7473958551883697, + "step": 2042 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.3958557128906, + "epoch": 0.6538646183389343, + "grad_norm": 0.2861766219139099, + "kl": 0.4346440315246582, + "learning_rate": 6.452343519705637e-06, + "loss": 0.1278, + "reward": 1.735937523841858, + "reward_std": 0.2187936559319496, + "rewards/accuracy_reward": 0.04791666828095913, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2043 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.0354370117187, + "epoch": 0.6541846695471275, + "grad_norm": 0.09741534292697906, + "kl": 0.24645915627479553, + "learning_rate": 6.441896614131918e-06, + "loss": 0.0712, + "reward": 1.7723958849906922, + "reward_std": 0.16998118087649344, + "rewards/accuracy_reward": 0.06250000298023224, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2044 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.9250183105469, + "epoch": 0.6545047207553208, + "grad_norm": 0.22848017513751984, + "kl": 0.35786485224962233, + "learning_rate": 6.431454152788659e-06, + "loss": 0.0894, + "reward": 1.7375000238418579, + "reward_std": 0.19461821913719177, + "rewards/accuracy_reward": 0.03541666716337204, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.743750023841858, + "step": 2045 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.7562805175781, + "epoch": 0.6548247719635142, + "grad_norm": 0.16886331140995026, + "kl": 0.26599433794617655, + "learning_rate": 6.421016148718968e-06, + "loss": 0.1011, + "reward": 1.8000000357627868, + "reward_std": 0.23989782929420472, + "rewards/accuracy_reward": 0.10625000223517418, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.735416692495346, + "step": 2046 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.939599609375, + "epoch": 0.6551448231717075, + "grad_norm": 0.28325778245925903, + "kl": 0.4531488560140133, + "learning_rate": 6.410582614960375e-06, + "loss": 0.1435, + "reward": 1.8041667222976685, + "reward_std": 0.27296153604984286, + "rewards/accuracy_reward": 0.13750000298023224, + "rewards/format_reward": 0.9270833551883697, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2047 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.6479370117188, + "epoch": 0.6554648743799008, + "grad_norm": 0.14918236434459686, + "kl": 0.2889488823711872, + "learning_rate": 6.400153564544831e-06, + "loss": 0.095, + "reward": 1.8020833730697632, + "reward_std": 0.19902113303542138, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2048 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.733349609375, + "epoch": 0.6557849255880941, + "grad_norm": 0.33905163407325745, + "kl": 0.5450758814811707, + "learning_rate": 6.389729010498693e-06, + "loss": 0.1293, + "reward": 1.751562523841858, + "reward_std": 0.21070124506950377, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7348958611488342, + "step": 2049 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.783349609375, + "epoch": 0.6561049767962874, + "grad_norm": 0.18853819370269775, + "kl": 0.4303215779364109, + "learning_rate": 6.379308965842689e-06, + "loss": 0.1048, + "reward": 1.7203125476837158, + "reward_std": 0.2303944431245327, + "rewards/accuracy_reward": 0.04375000111758709, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.732812511920929, + "step": 2050 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.1937683105468, + "epoch": 0.6564250280044808, + "grad_norm": 0.2386639267206192, + "kl": 0.8055388882756234, + "learning_rate": 6.368893443591924e-06, + "loss": 0.1558, + "reward": 1.6734375476837158, + "reward_std": 0.32045554369688034, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.9104166805744172, + "rewards/tag_count_reward": 0.7234375178813934, + "step": 2051 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.7479248046875, + "epoch": 0.656745079212674, + "grad_norm": 0.3026362955570221, + "kl": 0.38807725757360456, + "learning_rate": 6.3584824567558525e-06, + "loss": 0.0985, + "reward": 1.7463542103767395, + "reward_std": 0.18500390723347665, + "rewards/accuracy_reward": 0.04375000223517418, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7380208611488343, + "step": 2052 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.1458435058594, + "epoch": 0.6570651304208673, + "grad_norm": 0.48022738099098206, + "kl": 0.4987993150949478, + "learning_rate": 6.34807601833826e-06, + "loss": 0.1314, + "reward": 1.6812500476837158, + "reward_std": 0.25787831619381907, + "rewards/accuracy_reward": 0.010416666977107525, + "rewards/format_reward": 0.9395833611488342, + "rewards/tag_count_reward": 0.7312500178813934, + "step": 2053 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.7750213623046, + "epoch": 0.6573851816290607, + "grad_norm": 0.26611456274986267, + "kl": 0.5643174603581429, + "learning_rate": 6.33767414133726e-06, + "loss": 0.1348, + "reward": 1.7416667222976685, + "reward_std": 0.2692039854824543, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.9375000119209289, + "rewards/tag_count_reward": 0.7333333492279053, + "step": 2054 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.2396057128906, + "epoch": 0.6577052328372539, + "grad_norm": 0.19766655564308167, + "kl": 0.7188455149531364, + "learning_rate": 6.327276838745257e-06, + "loss": 0.1722, + "reward": 1.702083373069763, + "reward_std": 0.2696689248085022, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.9354166805744171, + "rewards/tag_count_reward": 0.7270833492279053, + "step": 2055 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.5333557128906, + "epoch": 0.6580252840454472, + "grad_norm": 0.38821715116500854, + "kl": 0.7406436443328858, + "learning_rate": 6.316884123548947e-06, + "loss": 0.1643, + "reward": 1.692708384990692, + "reward_std": 0.32253962606191633, + "rewards/accuracy_reward": 0.04166666734963655, + "rewards/format_reward": 0.9208333551883697, + "rewards/tag_count_reward": 0.7302083551883698, + "step": 2056 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.9312713623046, + "epoch": 0.6583453352536406, + "grad_norm": 0.3075641691684723, + "kl": 0.5838876664638519, + "learning_rate": 6.306496008729302e-06, + "loss": 0.1149, + "reward": 1.8093750476837158, + "reward_std": 0.21804522201418877, + "rewards/accuracy_reward": 0.12708333730697632, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7343750298023224, + "step": 2057 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.214599609375, + "epoch": 0.6586653864618339, + "grad_norm": 0.2144460529088974, + "kl": 1.0372873276472092, + "learning_rate": 6.29611250726154e-06, + "loss": 0.2162, + "reward": 1.6010417103767396, + "reward_std": 0.4080072194337845, + "rewards/accuracy_reward": 0.012500000186264515, + "rewards/format_reward": 0.8895833492279053, + "rewards/tag_count_reward": 0.6989583432674408, + "step": 2058 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.3250244140625, + "epoch": 0.6589854376700272, + "grad_norm": 0.2626437842845917, + "kl": 0.6710316807031631, + "learning_rate": 6.285733632115118e-06, + "loss": 0.1558, + "reward": 1.731770884990692, + "reward_std": 0.3729784592986107, + "rewards/accuracy_reward": 0.09583333507180214, + "rewards/format_reward": 0.916666692495346, + "rewards/tag_count_reward": 0.7192708611488342, + "step": 2059 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.3000183105469, + "epoch": 0.6593054888782205, + "grad_norm": 0.42148250341415405, + "kl": 0.7080471813678741, + "learning_rate": 6.275359396253721e-06, + "loss": 0.1478, + "reward": 1.6453125476837158, + "reward_std": 0.3132076248526573, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.9187500238418579, + "rewards/tag_count_reward": 0.7244791865348816, + "step": 2060 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.527099609375, + "epoch": 0.6596255400864138, + "grad_norm": 0.3597671091556549, + "kl": 0.5296546339988708, + "learning_rate": 6.264989812635227e-06, + "loss": 0.1318, + "reward": 1.7067708492279052, + "reward_std": 0.2610057801008224, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.9333333551883698, + "rewards/tag_count_reward": 0.729687511920929, + "step": 2061 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.3333526611328, + "epoch": 0.6599455912946072, + "grad_norm": 0.3411342203617096, + "kl": 0.5472671233117581, + "learning_rate": 6.2546248942117134e-06, + "loss": 0.1035, + "reward": 1.7197917103767395, + "reward_std": 0.2231937639415264, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.7385416924953461, + "step": 2062 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.7750183105469, + "epoch": 0.6602656425028004, + "grad_norm": 0.22413881123065948, + "kl": 0.5802815616130829, + "learning_rate": 6.244264653929428e-06, + "loss": 0.1309, + "reward": 1.745833396911621, + "reward_std": 0.306681290268898, + "rewards/accuracy_reward": 0.0916666679084301, + "rewards/format_reward": 0.9333333551883698, + "rewards/tag_count_reward": 0.7208333432674408, + "step": 2063 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.4541778564453, + "epoch": 0.6605856937109937, + "grad_norm": 0.3446758985519409, + "kl": 0.7506268709897995, + "learning_rate": 6.2339091047287725e-06, + "loss": 0.1382, + "reward": 1.793750035762787, + "reward_std": 0.32375111281871793, + "rewards/accuracy_reward": 0.16250000502914191, + "rewards/format_reward": 0.9166666805744171, + "rewards/tag_count_reward": 0.7145833492279052, + "step": 2064 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.904183959961, + "epoch": 0.6609057449191871, + "grad_norm": 0.31046417355537415, + "kl": 0.7741656035184861, + "learning_rate": 6.2235582595442935e-06, + "loss": 0.1824, + "reward": 1.8447916984558106, + "reward_std": 0.2975379958748817, + "rewards/accuracy_reward": 0.18541667237877846, + "rewards/format_reward": 0.931250023841858, + "rewards/tag_count_reward": 0.7281250178813934, + "step": 2065 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.283349609375, + "epoch": 0.6612257961273804, + "grad_norm": 0.21272774040699005, + "kl": 0.6825548261404037, + "learning_rate": 6.213212131304664e-06, + "loss": 0.1454, + "reward": 1.722395896911621, + "reward_std": 0.3414647400379181, + "rewards/accuracy_reward": 0.08541666828095913, + "rewards/format_reward": 0.9208333551883697, + "rewards/tag_count_reward": 0.7161458551883697, + "step": 2066 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.404183959961, + "epoch": 0.6615458473355736, + "grad_norm": 0.18681898713111877, + "kl": 0.5913916632533074, + "learning_rate": 6.202870732932656e-06, + "loss": 0.1527, + "reward": 1.7916667103767394, + "reward_std": 0.31743351817131044, + "rewards/accuracy_reward": 0.1458333356305957, + "rewards/format_reward": 0.9187500178813934, + "rewards/tag_count_reward": 0.7270833551883698, + "step": 2067 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.8375244140625, + "epoch": 0.661865898543767, + "grad_norm": 0.2054324448108673, + "kl": 0.5609342604875565, + "learning_rate": 6.19253407734514e-06, + "loss": 0.1721, + "reward": 1.7661458730697632, + "reward_std": 0.2891497790813446, + "rewards/accuracy_reward": 0.11666667014360428, + "rewards/format_reward": 0.9250000298023224, + "rewards/tag_count_reward": 0.7244791865348816, + "step": 2068 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.2208526611328, + "epoch": 0.6621859497519603, + "grad_norm": 0.1147092655301094, + "kl": 0.2838616266846657, + "learning_rate": 6.182202177453063e-06, + "loss": 0.1007, + "reward": 1.7598958611488342, + "reward_std": 0.22217274978756904, + "rewards/accuracy_reward": 0.06875000223517418, + "rewards/format_reward": 0.9520833611488342, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2069 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.927099609375, + "epoch": 0.6625060009601537, + "grad_norm": 0.1614256501197815, + "kl": 0.4955372139811516, + "learning_rate": 6.171875046161429e-06, + "loss": 0.1084, + "reward": 1.7739583611488343, + "reward_std": 0.2446311503648758, + "rewards/accuracy_reward": 0.0979166679084301, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.7322916865348816, + "step": 2070 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.2250244140625, + "epoch": 0.6628260521683469, + "grad_norm": 0.15170036256313324, + "kl": 0.2690488576889038, + "learning_rate": 6.161552696369291e-06, + "loss": 0.0638, + "reward": 1.7848958611488341, + "reward_std": 0.19568712040781974, + "rewards/accuracy_reward": 0.08125000409781932, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7411458432674408, + "step": 2071 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.9646026611329, + "epoch": 0.6631461033765402, + "grad_norm": 0.11659003794193268, + "kl": 0.32981459945440295, + "learning_rate": 6.151235140969719e-06, + "loss": 0.1103, + "reward": 1.7593750476837158, + "reward_std": 0.25664503276348116, + "rewards/accuracy_reward": 0.07500000167638063, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7343750298023224, + "step": 2072 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.8000183105469, + "epoch": 0.6634661545847336, + "grad_norm": 0.11869725584983826, + "kl": 0.30063339732587335, + "learning_rate": 6.1409223928498085e-06, + "loss": 0.0911, + "reward": 1.8427083849906922, + "reward_std": 0.17913474664092063, + "rewards/accuracy_reward": 0.1354166718199849, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 2073 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.7979339599609, + "epoch": 0.6637862057929269, + "grad_norm": 0.17836003005504608, + "kl": 0.41804255843162536, + "learning_rate": 6.130614464890645e-06, + "loss": 0.1133, + "reward": 1.7947917342185975, + "reward_std": 0.2575798869132996, + "rewards/accuracy_reward": 0.10625000335276127, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2074 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.6479339599609, + "epoch": 0.6641062570011201, + "grad_norm": 0.2909286916255951, + "kl": 0.6877239581197501, + "learning_rate": 6.120311369967286e-06, + "loss": 0.0902, + "reward": 1.7838542222976685, + "reward_std": 0.179420168697834, + "rewards/accuracy_reward": 0.10833333730697632, + "rewards/format_reward": 0.9416666805744172, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2075 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.4791809082031, + "epoch": 0.6644263082093135, + "grad_norm": 0.16476832330226898, + "kl": 0.2650918196886778, + "learning_rate": 6.11001312094876e-06, + "loss": 0.0947, + "reward": 1.8890625476837157, + "reward_std": 0.19297146275639535, + "rewards/accuracy_reward": 0.17500000298023224, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7453125178813934, + "step": 2076 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.3083557128906, + "epoch": 0.6647463594175068, + "grad_norm": 0.15465334057807922, + "kl": 0.4669275902211666, + "learning_rate": 6.099719730698046e-06, + "loss": 0.1589, + "reward": 1.7192708849906921, + "reward_std": 0.27212869971990583, + "rewards/accuracy_reward": 0.04375000223517418, + "rewards/format_reward": 0.9395833611488342, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2077 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.4958435058594, + "epoch": 0.6650664106257002, + "grad_norm": 0.1373804807662964, + "kl": 0.41616974845528604, + "learning_rate": 6.089431212072043e-06, + "loss": 0.098, + "reward": 1.7343750238418578, + "reward_std": 0.2505802020430565, + "rewards/accuracy_reward": 0.039583333767950535, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7427083611488342, + "step": 2078 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.3833557128906, + "epoch": 0.6653864618338934, + "grad_norm": 0.13085642457008362, + "kl": 0.3719322353601456, + "learning_rate": 6.079147577921576e-06, + "loss": 0.1556, + "reward": 1.8166666984558106, + "reward_std": 0.202004524320364, + "rewards/accuracy_reward": 0.12500000447034837, + "rewards/format_reward": 0.9437500238418579, + "rewards/tag_count_reward": 0.7479166865348816, + "step": 2079 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.2479309082031, + "epoch": 0.6657065130420867, + "grad_norm": 0.14317762851715088, + "kl": 0.3922991409897804, + "learning_rate": 6.068868841091361e-06, + "loss": 0.125, + "reward": 1.7104166984558105, + "reward_std": 0.25869596004486084, + "rewards/accuracy_reward": 0.02708333358168602, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2080 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.2895965576172, + "epoch": 0.6660265642502801, + "grad_norm": 0.15368254482746124, + "kl": 0.27517750635743143, + "learning_rate": 6.05859501442e-06, + "loss": 0.0966, + "reward": 1.7619792222976685, + "reward_std": 0.2081921711564064, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.7473958551883697, + "step": 2081 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.2271057128906, + "epoch": 0.6663466154584734, + "grad_norm": 0.20891135931015015, + "kl": 0.4285693295300007, + "learning_rate": 6.048326110739968e-06, + "loss": 0.1587, + "reward": 1.7369792222976685, + "reward_std": 0.22642296701669692, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.9375000298023224, + "rewards/tag_count_reward": 0.743229192495346, + "step": 2082 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.933349609375, + "epoch": 0.6666666666666666, + "grad_norm": 0.05916117876768112, + "kl": 0.23351404666900635, + "learning_rate": 6.038062142877583e-06, + "loss": 0.06, + "reward": 1.798958384990692, + "reward_std": 0.12938002720475197, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7510416805744171, + "step": 2083 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.4312683105469, + "epoch": 0.66698671787486, + "grad_norm": 0.16329483687877655, + "kl": 0.3161396749317646, + "learning_rate": 6.027803123653e-06, + "loss": 0.1275, + "reward": 1.7046875476837158, + "reward_std": 0.2415686145424843, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.9291666865348815, + "rewards/tag_count_reward": 0.7421875298023224, + "step": 2084 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.9396026611328, + "epoch": 0.6673067690830533, + "grad_norm": 0.09667642414569855, + "kl": 0.3785520136356354, + "learning_rate": 6.0175490658801934e-06, + "loss": 0.1069, + "reward": 1.690625047683716, + "reward_std": 0.22385728284716605, + "rewards/accuracy_reward": 0.010416667163372039, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 2085 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.395849609375, + "epoch": 0.6676268202912466, + "grad_norm": 0.08739566057920456, + "kl": 0.34865760840475557, + "learning_rate": 6.00729998236694e-06, + "loss": 0.1112, + "reward": 1.7708333730697632, + "reward_std": 0.20698952823877334, + "rewards/accuracy_reward": 0.09166666865348816, + "rewards/format_reward": 0.9395833551883698, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 2086 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.7646118164063, + "epoch": 0.6679468714994399, + "grad_norm": 0.12396329641342163, + "kl": 0.40712554156780245, + "learning_rate": 5.997055885914806e-06, + "loss": 0.1277, + "reward": 1.7416667222976685, + "reward_std": 0.2734927900135517, + "rewards/accuracy_reward": 0.0687500013038516, + "rewards/format_reward": 0.9375000119209289, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2087 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.6958526611328, + "epoch": 0.6682669227076332, + "grad_norm": 0.20949289202690125, + "kl": 0.435461837798357, + "learning_rate": 5.986816789319123e-06, + "loss": 0.1051, + "reward": 1.7489583849906922, + "reward_std": 0.21258105263113974, + "rewards/accuracy_reward": 0.06250000149011611, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 2088 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.9646057128906, + "epoch": 0.6685869739158266, + "grad_norm": 0.08775470405817032, + "kl": 0.4969747729599476, + "learning_rate": 5.976582705368982e-06, + "loss": 0.0677, + "reward": 1.7697917103767395, + "reward_std": 0.2106465920805931, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2089 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.9708465576172, + "epoch": 0.6689070251240199, + "grad_norm": 0.07833829522132874, + "kl": 0.30936159677803515, + "learning_rate": 5.966353646847215e-06, + "loss": 0.115, + "reward": 1.7484375476837157, + "reward_std": 0.23181662410497667, + "rewards/accuracy_reward": 0.060416669212281705, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2090 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.608349609375, + "epoch": 0.6692270763322131, + "grad_norm": 0.10873490571975708, + "kl": 0.26770354211330416, + "learning_rate": 5.956129626530376e-06, + "loss": 0.0866, + "reward": 1.7546875596046447, + "reward_std": 0.1789041481912136, + "rewards/accuracy_reward": 0.052083334885537624, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2091 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.8583465576172, + "epoch": 0.6695471275404065, + "grad_norm": 0.17797769606113434, + "kl": 0.4399689495563507, + "learning_rate": 5.945910657188717e-06, + "loss": 0.1301, + "reward": 1.7484375476837157, + "reward_std": 0.2705027997493744, + "rewards/accuracy_reward": 0.06666666828095913, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7338541924953461, + "step": 2092 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.7645965576172, + "epoch": 0.6698671787485998, + "grad_norm": 0.2502882480621338, + "kl": 0.25185356885194776, + "learning_rate": 5.9356967515861955e-06, + "loss": 0.0978, + "reward": 1.8630208849906922, + "reward_std": 0.2282662123441696, + "rewards/accuracy_reward": 0.16250000577419996, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7421875059604645, + "step": 2093 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.6791839599609, + "epoch": 0.6701872299567931, + "grad_norm": 0.24098429083824158, + "kl": 0.21436882950365543, + "learning_rate": 5.925487922480431e-06, + "loss": 0.0998, + "reward": 1.8354167222976685, + "reward_std": 0.20333680436015128, + "rewards/accuracy_reward": 0.1291666692122817, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2094 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.0687683105468, + "epoch": 0.6705072811649864, + "grad_norm": 0.16339395940303802, + "kl": 0.5092102646827698, + "learning_rate": 5.9152841826227136e-06, + "loss": 0.1156, + "reward": 1.8442708849906921, + "reward_std": 0.2604293584823608, + "rewards/accuracy_reward": 0.15208333637565374, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7338541924953461, + "step": 2095 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.2104309082031, + "epoch": 0.6708273323731797, + "grad_norm": 0.09706208854913712, + "kl": 0.4753930263221264, + "learning_rate": 5.905085544757965e-06, + "loss": 0.12, + "reward": 1.8401042103767395, + "reward_std": 0.21673982813954354, + "rewards/accuracy_reward": 0.15208333730697632, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2096 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.1937713623047, + "epoch": 0.671147383581373, + "grad_norm": 0.19295969605445862, + "kl": 0.42010639905929564, + "learning_rate": 5.894892021624744e-06, + "loss": 0.1263, + "reward": 1.7885417103767396, + "reward_std": 0.25435220301151273, + "rewards/accuracy_reward": 0.11041666977107525, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7302083611488343, + "step": 2097 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.1750183105469, + "epoch": 0.6714674347895663, + "grad_norm": 0.11223884671926498, + "kl": 0.31995190382003785, + "learning_rate": 5.884703625955219e-06, + "loss": 0.1004, + "reward": 1.7244791865348816, + "reward_std": 0.22140763103961944, + "rewards/accuracy_reward": 0.03125000149011612, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2098 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.5104370117188, + "epoch": 0.6717874859977596, + "grad_norm": 0.14636114239692688, + "kl": 0.4229055255651474, + "learning_rate": 5.874520370475154e-06, + "loss": 0.0831, + "reward": 1.7130208730697631, + "reward_std": 0.16997253373265267, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7401041924953461, + "step": 2099 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.6271118164062, + "epoch": 0.672107537205953, + "grad_norm": 0.10089104622602463, + "kl": 0.27489122599363325, + "learning_rate": 5.864342267903885e-06, + "loss": 0.0913, + "reward": 1.754687535762787, + "reward_std": 0.2580702304840088, + "rewards/accuracy_reward": 0.06666666939854622, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7338541984558106, + "step": 2100 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.5208526611328, + "epoch": 0.6724275884141463, + "grad_norm": 0.10464661568403244, + "kl": 0.20850623920559883, + "learning_rate": 5.854169330954324e-06, + "loss": 0.0723, + "reward": 1.8265625596046449, + "reward_std": 0.15675044655799866, + "rewards/accuracy_reward": 0.10833333786576986, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.743229192495346, + "step": 2101 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.170849609375, + "epoch": 0.6727476396223395, + "grad_norm": 0.18542306125164032, + "kl": 0.2565278984606266, + "learning_rate": 5.84400157233292e-06, + "loss": 0.0743, + "reward": 1.7557292222976684, + "reward_std": 0.227911539375782, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7411458671092988, + "step": 2102 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.0125183105469, + "epoch": 0.6730676908305329, + "grad_norm": 0.12582838535308838, + "kl": 0.4552136674523354, + "learning_rate": 5.833839004739662e-06, + "loss": 0.1309, + "reward": 1.7572917103767396, + "reward_std": 0.26424730867147445, + "rewards/accuracy_reward": 0.07291666772216558, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7302083432674408, + "step": 2103 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.989599609375, + "epoch": 0.6733877420387262, + "grad_norm": 0.10714533179998398, + "kl": 0.242107355594635, + "learning_rate": 5.823681640868049e-06, + "loss": 0.092, + "reward": 1.7453125357627868, + "reward_std": 0.1701894871890545, + "rewards/accuracy_reward": 0.0312500013038516, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2104 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.629183959961, + "epoch": 0.6737077932469195, + "grad_norm": 0.10627640783786774, + "kl": 0.43605479300022126, + "learning_rate": 5.8135294934050855e-06, + "loss": 0.0705, + "reward": 1.7822916984558106, + "reward_std": 0.16257481276988983, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2105 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.1437683105469, + "epoch": 0.6740278444551128, + "grad_norm": 0.16275854408740997, + "kl": 0.2939316496253014, + "learning_rate": 5.803382575031257e-06, + "loss": 0.1094, + "reward": 1.7255208849906922, + "reward_std": 0.21441070288419722, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2106 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.0083526611328, + "epoch": 0.6743478956633061, + "grad_norm": 0.2260814607143402, + "kl": 0.22412323877215384, + "learning_rate": 5.793240898420521e-06, + "loss": 0.0938, + "reward": 1.7854167103767395, + "reward_std": 0.16013515293598174, + "rewards/accuracy_reward": 0.07500000298023224, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2107 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.847933959961, + "epoch": 0.6746679468714994, + "grad_norm": 0.14437879621982574, + "kl": 0.19934857040643691, + "learning_rate": 5.783104476240284e-06, + "loss": 0.0919, + "reward": 1.806770884990692, + "reward_std": 0.19802356511354446, + "rewards/accuracy_reward": 0.09791666865348816, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7421875298023224, + "step": 2108 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.7958557128907, + "epoch": 0.6749879980796928, + "grad_norm": 0.14482718706130981, + "kl": 0.23657144084572793, + "learning_rate": 5.772973321151392e-06, + "loss": 0.0593, + "reward": 1.7963541865348815, + "reward_std": 0.12375809252262115, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7421875119209289, + "step": 2109 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.0229370117188, + "epoch": 0.675308049287886, + "grad_norm": 0.15969114005565643, + "kl": 0.32010622769594194, + "learning_rate": 5.762847445808111e-06, + "loss": 0.0906, + "reward": 1.7276041865348817, + "reward_std": 0.20104559063911437, + "rewards/accuracy_reward": 0.02708333432674408, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2110 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.560433959961, + "epoch": 0.6756281004960794, + "grad_norm": 0.09771303087472916, + "kl": 0.2898152723908424, + "learning_rate": 5.7527268628581175e-06, + "loss": 0.0973, + "reward": 1.7750000357627869, + "reward_std": 0.20687551125884057, + "rewards/accuracy_reward": 0.07291666902601719, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2111 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.3541931152344, + "epoch": 0.6759481517042727, + "grad_norm": 0.12097357213497162, + "kl": 0.20418170019984244, + "learning_rate": 5.7426115849424635e-06, + "loss": 0.0705, + "reward": 1.7692708849906922, + "reward_std": 0.17754657715559005, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2112 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.9541870117188, + "epoch": 0.676268202912466, + "grad_norm": 0.14658434689044952, + "kl": 0.29820197224617007, + "learning_rate": 5.73250162469559e-06, + "loss": 0.054, + "reward": 1.7385416984558106, + "reward_std": 0.18357955366373063, + "rewards/accuracy_reward": 0.03541666809469461, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2113 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.6000152587891, + "epoch": 0.6765882541206593, + "grad_norm": 0.15589158236980438, + "kl": 0.24293862506747246, + "learning_rate": 5.722396994745284e-06, + "loss": 0.0957, + "reward": 1.790625023841858, + "reward_std": 0.2170632876455784, + "rewards/accuracy_reward": 0.08750000260770321, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2114 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.4604370117188, + "epoch": 0.6769083053288526, + "grad_norm": 0.13007795810699463, + "kl": 0.1325427707284689, + "learning_rate": 5.712297707712694e-06, + "loss": 0.0346, + "reward": 1.7927083730697633, + "reward_std": 0.0649714283645153, + "rewards/accuracy_reward": 0.05625000149011612, + "rewards/format_reward": 0.987500011920929, + "rewards/tag_count_reward": 0.7489583373069764, + "step": 2115 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.7604431152344, + "epoch": 0.6772283565370459, + "grad_norm": 0.09760616719722748, + "kl": 0.18721306025981904, + "learning_rate": 5.702203776212269e-06, + "loss": 0.0737, + "reward": 1.8114583849906922, + "reward_std": 0.1576378509402275, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2116 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.0771026611328, + "epoch": 0.6775484077452393, + "grad_norm": 0.24982647597789764, + "kl": 0.19170795604586602, + "learning_rate": 5.692115212851786e-06, + "loss": 0.0645, + "reward": 1.8317708730697633, + "reward_std": 0.17303552776575087, + "rewards/accuracy_reward": 0.10833333544433117, + "rewards/format_reward": 0.9770833551883698, + "rewards/tag_count_reward": 0.7463541805744172, + "step": 2117 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.4708557128906, + "epoch": 0.6778684589534325, + "grad_norm": 0.1240825280547142, + "kl": 0.2379318844527006, + "learning_rate": 5.682032030232314e-06, + "loss": 0.0579, + "reward": 1.778125035762787, + "reward_std": 0.1435864046216011, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2118 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.1812683105469, + "epoch": 0.6781885101616258, + "grad_norm": 0.13775448501110077, + "kl": 0.38389052599668505, + "learning_rate": 5.6719542409482e-06, + "loss": 0.0865, + "reward": 1.7364583730697631, + "reward_std": 0.22209640294313432, + "rewards/accuracy_reward": 0.05416666846722364, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.7385416924953461, + "step": 2119 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.2541870117187, + "epoch": 0.6785085613698192, + "grad_norm": 0.1977604329586029, + "kl": 0.2928472336381674, + "learning_rate": 5.6618818575870486e-06, + "loss": 0.0951, + "reward": 1.7718750476837157, + "reward_std": 0.2150444954633713, + "rewards/accuracy_reward": 0.07291666809469462, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.740625011920929, + "step": 2120 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.2041870117188, + "epoch": 0.6788286125780125, + "grad_norm": 0.1186528429389, + "kl": 0.27777676060795786, + "learning_rate": 5.6518148927297215e-06, + "loss": 0.0865, + "reward": 1.8359375238418578, + "reward_std": 0.18982706367969512, + "rewards/accuracy_reward": 0.12708333749324083, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2121 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.9166778564453, + "epoch": 0.6791486637862058, + "grad_norm": 0.24591495096683502, + "kl": 0.3266368605196476, + "learning_rate": 5.6417533589503036e-06, + "loss": 0.0709, + "reward": 1.7572916865348815, + "reward_std": 0.1611829034984112, + "rewards/accuracy_reward": 0.047916668094694616, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2122 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.0229339599609, + "epoch": 0.6794687149943991, + "grad_norm": 0.1680237203836441, + "kl": 0.30815255306661127, + "learning_rate": 5.631697268816114e-06, + "loss": 0.0951, + "reward": 1.7854167103767395, + "reward_std": 0.20621230602264404, + "rewards/accuracy_reward": 0.07708333600312471, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7395833432674408, + "step": 2123 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.6750244140625, + "epoch": 0.6797887662025924, + "grad_norm": 0.19045765697956085, + "kl": 0.41359340101480485, + "learning_rate": 5.621646634887647e-06, + "loss": 0.1132, + "reward": 1.678645873069763, + "reward_std": 0.2574477940797806, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.9375000178813935, + "rewards/tag_count_reward": 0.7328125238418579, + "step": 2124 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.6187683105469, + "epoch": 0.6801088174107858, + "grad_norm": 0.14711131155490875, + "kl": 0.46874346137046813, + "learning_rate": 5.611601469718601e-06, + "loss": 0.1238, + "reward": 1.8359375476837159, + "reward_std": 0.24985048174858093, + "rewards/accuracy_reward": 0.14375000298023224, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2125 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.0187744140625, + "epoch": 0.680428868618979, + "grad_norm": 0.24069844186306, + "kl": 0.36626454442739487, + "learning_rate": 5.601561785855833e-06, + "loss": 0.1027, + "reward": 1.8125000596046448, + "reward_std": 0.24355322867631912, + "rewards/accuracy_reward": 0.11250000316649675, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2126 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.9208526611328, + "epoch": 0.6807489198271723, + "grad_norm": 0.26626524329185486, + "kl": 0.39541344419121743, + "learning_rate": 5.591527595839365e-06, + "loss": 0.1121, + "reward": 1.7197916746139525, + "reward_std": 0.18664910271763802, + "rewards/accuracy_reward": 0.018750000558793545, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2127 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.7750183105469, + "epoch": 0.6810689710353657, + "grad_norm": 0.21741662919521332, + "kl": 0.30557314343750475, + "learning_rate": 5.5814989122023385e-06, + "loss": 0.1023, + "reward": 1.8036458849906922, + "reward_std": 0.1822378784418106, + "rewards/accuracy_reward": 0.10416667070239782, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7432291746139527, + "step": 2128 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.6750213623047, + "epoch": 0.681389022243559, + "grad_norm": 0.1766936480998993, + "kl": 0.3221236035227776, + "learning_rate": 5.571475747471036e-06, + "loss": 0.0895, + "reward": 1.7989583611488342, + "reward_std": 0.1792902246117592, + "rewards/accuracy_reward": 0.10000000204890966, + "rewards/format_reward": 0.9604166746139526, + "rewards/tag_count_reward": 0.7385416746139526, + "step": 2129 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.2250183105468, + "epoch": 0.6817090734517522, + "grad_norm": 0.20995013415813446, + "kl": 0.5006738707423211, + "learning_rate": 5.561458114164837e-06, + "loss": 0.156, + "reward": 1.7026041865348815, + "reward_std": 0.28156317621469495, + "rewards/accuracy_reward": 0.0375, + "rewards/format_reward": 0.9375000178813935, + "rewards/tag_count_reward": 0.7276041865348816, + "step": 2130 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.9729248046875, + "epoch": 0.6820291246599456, + "grad_norm": 0.1605108231306076, + "kl": 0.27917307913303374, + "learning_rate": 5.551446024796214e-06, + "loss": 0.0716, + "reward": 1.850520873069763, + "reward_std": 0.19164575263857841, + "rewards/accuracy_reward": 0.13750000596046447, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7442708432674408, + "step": 2131 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.6562683105469, + "epoch": 0.6823491758681389, + "grad_norm": 0.3428249955177307, + "kl": 0.2938417851924896, + "learning_rate": 5.541439491870716e-06, + "loss": 0.1212, + "reward": 1.7604167103767394, + "reward_std": 0.2516637593507767, + "rewards/accuracy_reward": 0.07708333693444729, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2132 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.7375183105469, + "epoch": 0.6826692270763323, + "grad_norm": 0.12911154329776764, + "kl": 0.3031469196081161, + "learning_rate": 5.53143852788695e-06, + "loss": 0.0998, + "reward": 1.7541667222976685, + "reward_std": 0.24215374365448952, + "rewards/accuracy_reward": 0.07083333507180214, + "rewards/format_reward": 0.9395833611488342, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2133 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.9583557128906, + "epoch": 0.6829892782845255, + "grad_norm": 0.13713647425174713, + "kl": 0.36711022555828093, + "learning_rate": 5.521443145336568e-06, + "loss": 0.1164, + "reward": 1.757812535762787, + "reward_std": 0.24515254348516463, + "rewards/accuracy_reward": 0.07291666772216558, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7328125298023224, + "step": 2134 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.520849609375, + "epoch": 0.6833093294927188, + "grad_norm": 0.2623184025287628, + "kl": 0.4985116317868233, + "learning_rate": 5.511453356704251e-06, + "loss": 0.1464, + "reward": 1.6989583492279052, + "reward_std": 0.22973346561193467, + "rewards/accuracy_reward": 0.01666666716337204, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2135 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.529183959961, + "epoch": 0.6836293807009122, + "grad_norm": 0.2138541042804718, + "kl": 0.3363912686705589, + "learning_rate": 5.501469174467695e-06, + "loss": 0.0797, + "reward": 1.8890625476837157, + "reward_std": 0.17162477001547813, + "rewards/accuracy_reward": 0.17291666977107525, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7432291805744171, + "step": 2136 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.3000091552734, + "epoch": 0.6839494319091055, + "grad_norm": 0.20941811800003052, + "kl": 0.5751274846494198, + "learning_rate": 5.491490611097586e-06, + "loss": 0.1335, + "reward": 1.8187500476837157, + "reward_std": 0.27915480434894563, + "rewards/accuracy_reward": 0.15000000577419997, + "rewards/format_reward": 0.9354166865348816, + "rewards/tag_count_reward": 0.7333333492279053, + "step": 2137 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.314599609375, + "epoch": 0.6842694831172987, + "grad_norm": 0.1732366532087326, + "kl": 0.4344419322907925, + "learning_rate": 5.481517679057595e-06, + "loss": 0.1137, + "reward": 1.8453125596046447, + "reward_std": 0.21131417751312256, + "rewards/accuracy_reward": 0.15416667181998492, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 2138 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.8166931152343, + "epoch": 0.6845895343254921, + "grad_norm": 0.3594232499599457, + "kl": 0.330796167999506, + "learning_rate": 5.4715503908043654e-06, + "loss": 0.0919, + "reward": 1.7781250119209289, + "reward_std": 0.27583343237638475, + "rewards/accuracy_reward": 0.09375000055879354, + "rewards/format_reward": 0.9479166805744171, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2139 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.20419921875, + "epoch": 0.6849095855336854, + "grad_norm": 0.24212028086185455, + "kl": 0.5013069108128547, + "learning_rate": 5.461588758787484e-06, + "loss": 0.1604, + "reward": 1.7833333730697631, + "reward_std": 0.26907457038760185, + "rewards/accuracy_reward": 0.1062500026077032, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2140 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.8687744140625, + "epoch": 0.6852296367418788, + "grad_norm": 0.21253329515457153, + "kl": 0.4218080222606659, + "learning_rate": 5.4516327954494764e-06, + "loss": 0.1252, + "reward": 1.7338542103767396, + "reward_std": 0.2643717348575592, + "rewards/accuracy_reward": 0.05000000260770321, + "rewards/format_reward": 0.9437500298023224, + "rewards/tag_count_reward": 0.7401041924953461, + "step": 2141 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.6500183105469, + "epoch": 0.685549687950072, + "grad_norm": 0.22795309126377106, + "kl": 0.4938424080610275, + "learning_rate": 5.441682513225786e-06, + "loss": 0.1201, + "reward": 1.7203125357627869, + "reward_std": 0.2679354429244995, + "rewards/accuracy_reward": 0.050000001676380634, + "rewards/format_reward": 0.9333333551883698, + "rewards/tag_count_reward": 0.736979192495346, + "step": 2142 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.3500122070312, + "epoch": 0.6858697391582653, + "grad_norm": 0.15360769629478455, + "kl": 0.2627089634537697, + "learning_rate": 5.431737924544763e-06, + "loss": 0.0813, + "reward": 1.8484375596046447, + "reward_std": 0.18890125900506974, + "rewards/accuracy_reward": 0.14375000353902578, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2143 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.5666809082031, + "epoch": 0.6861897903664587, + "grad_norm": 0.31977367401123047, + "kl": 0.3762332767248154, + "learning_rate": 5.421799041827646e-06, + "loss": 0.0954, + "reward": 1.8666667103767396, + "reward_std": 0.22807515114545823, + "rewards/accuracy_reward": 0.18541667200624942, + "rewards/format_reward": 0.9437500238418579, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2144 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.3625183105469, + "epoch": 0.6865098415746519, + "grad_norm": 0.17487220466136932, + "kl": 0.3942092776298523, + "learning_rate": 5.411865877488536e-06, + "loss": 0.1055, + "reward": 1.7192708849906921, + "reward_std": 0.22079168483614922, + "rewards/accuracy_reward": 0.03333333395421505, + "rewards/format_reward": 0.943750011920929, + "rewards/tag_count_reward": 0.7421875119209289, + "step": 2145 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.1604461669922, + "epoch": 0.6868298927828452, + "grad_norm": 0.15001332759857178, + "kl": 0.41321970373392103, + "learning_rate": 5.401938443934405e-06, + "loss": 0.1174, + "reward": 1.7677083730697631, + "reward_std": 0.2351425528526306, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7385416924953461, + "step": 2146 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.3229370117188, + "epoch": 0.6871499439910386, + "grad_norm": 0.30220097303390503, + "kl": 0.42144326865673065, + "learning_rate": 5.392016753565059e-06, + "loss": 0.181, + "reward": 1.6854167103767395, + "reward_std": 0.3153300307691097, + "rewards/accuracy_reward": 0.027083334140479566, + "rewards/format_reward": 0.9250000178813934, + "rewards/tag_count_reward": 0.7333333432674408, + "step": 2147 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.4375213623047, + "epoch": 0.6874699951992319, + "grad_norm": 0.13343147933483124, + "kl": 0.2602052837610245, + "learning_rate": 5.382100818773144e-06, + "loss": 0.0989, + "reward": 1.7572917103767396, + "reward_std": 0.1719522811472416, + "rewards/accuracy_reward": 0.05000000149011612, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7427083432674408, + "step": 2148 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.0833618164063, + "epoch": 0.6877900464074251, + "grad_norm": 0.37386593222618103, + "kl": 0.31635802537202834, + "learning_rate": 5.3721906519440945e-06, + "loss": 0.1481, + "reward": 1.781250035762787, + "reward_std": 0.2873902410268784, + "rewards/accuracy_reward": 0.08958333656191826, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2149 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.1146026611328, + "epoch": 0.6881100976156185, + "grad_norm": 0.1896275281906128, + "kl": 0.336704520881176, + "learning_rate": 5.362286265456158e-06, + "loss": 0.1424, + "reward": 1.7984375476837158, + "reward_std": 0.2562291607260704, + "rewards/accuracy_reward": 0.11041666977107525, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2150 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.7875244140625, + "epoch": 0.6884301488238118, + "grad_norm": 0.19244590401649475, + "kl": 0.5367755405604839, + "learning_rate": 5.352387671680357e-06, + "loss": 0.1137, + "reward": 1.7083333611488343, + "reward_std": 0.2284008175134659, + "rewards/accuracy_reward": 0.020833334326744078, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2151 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.3708526611329, + "epoch": 0.6887502000320052, + "grad_norm": 0.290466845035553, + "kl": 0.4612713187932968, + "learning_rate": 5.34249488298048e-06, + "loss": 0.1148, + "reward": 1.8708333730697633, + "reward_std": 0.228383469581604, + "rewards/accuracy_reward": 0.18333333935588597, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7333333551883697, + "step": 2152 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.0833435058594, + "epoch": 0.6890702512401984, + "grad_norm": 0.2586331367492676, + "kl": 0.47631366848945617, + "learning_rate": 5.332607911713057e-06, + "loss": 0.1424, + "reward": 1.7338542222976685, + "reward_std": 0.33738535493612287, + "rewards/accuracy_reward": 0.0770833358168602, + "rewards/format_reward": 0.9333333551883698, + "rewards/tag_count_reward": 0.7234375178813934, + "step": 2153 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.283349609375, + "epoch": 0.6893903024483917, + "grad_norm": 0.3231147229671478, + "kl": 0.4204250156879425, + "learning_rate": 5.3227267702273625e-06, + "loss": 0.1248, + "reward": 1.7630208611488343, + "reward_std": 0.25813994109630584, + "rewards/accuracy_reward": 0.07500000167638063, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2154 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.6062744140625, + "epoch": 0.6897103536565851, + "grad_norm": 0.30079278349876404, + "kl": 0.36308655291795733, + "learning_rate": 5.312851470865383e-06, + "loss": 0.1368, + "reward": 1.733333373069763, + "reward_std": 0.26057887747883796, + "rewards/accuracy_reward": 0.054166669771075246, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7333333492279053, + "step": 2155 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.1395965576172, + "epoch": 0.6900304048647784, + "grad_norm": 0.2262878268957138, + "kl": 0.3142340861260891, + "learning_rate": 5.30298202596181e-06, + "loss": 0.1499, + "reward": 1.778125035762787, + "reward_std": 0.2502134948968887, + "rewards/accuracy_reward": 0.09583333544433117, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2156 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.9521118164063, + "epoch": 0.6903504560729716, + "grad_norm": 0.21603688597679138, + "kl": 0.2521523617208004, + "learning_rate": 5.293118447844023e-06, + "loss": 0.0752, + "reward": 1.7458333611488341, + "reward_std": 0.16236398369073868, + "rewards/accuracy_reward": 0.03125000111758709, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2157 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.2062683105469, + "epoch": 0.690670507281165, + "grad_norm": 0.17151077091693878, + "kl": 0.4505059730261564, + "learning_rate": 5.283260748832072e-06, + "loss": 0.1216, + "reward": 1.825520896911621, + "reward_std": 0.2066534325480461, + "rewards/accuracy_reward": 0.13333333730697633, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2158 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.445849609375, + "epoch": 0.6909905584893583, + "grad_norm": 0.20686721801757812, + "kl": 0.38486229777336123, + "learning_rate": 5.2734089412386646e-06, + "loss": 0.0973, + "reward": 1.7302083611488341, + "reward_std": 0.193883565813303, + "rewards/accuracy_reward": 0.03333333469927311, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2159 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.3666870117188, + "epoch": 0.6913106096975516, + "grad_norm": 0.16719911992549896, + "kl": 0.39740680269896983, + "learning_rate": 5.26356303736915e-06, + "loss": 0.0701, + "reward": 1.7463542103767395, + "reward_std": 0.20027967020869256, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7338541805744171, + "step": 2160 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.6208526611329, + "epoch": 0.6916306609057449, + "grad_norm": 0.1335284560918808, + "kl": 0.22143305391073226, + "learning_rate": 5.253723049521507e-06, + "loss": 0.0538, + "reward": 1.8437500596046448, + "reward_std": 0.19260309934616088, + "rewards/accuracy_reward": 0.13333333805203437, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7416666746139526, + "step": 2161 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.5687683105468, + "epoch": 0.6919507121139382, + "grad_norm": 0.11407246440649033, + "kl": 0.3099120303988457, + "learning_rate": 5.243888989986312e-06, + "loss": 0.1122, + "reward": 1.8453125357627869, + "reward_std": 0.2336819589138031, + "rewards/accuracy_reward": 0.1354166692122817, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2162 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.5437683105469, + "epoch": 0.6922707633221316, + "grad_norm": 0.26538652181625366, + "kl": 0.1981400392949581, + "learning_rate": 5.234060871046751e-06, + "loss": 0.0868, + "reward": 1.7760417222976685, + "reward_std": 0.2001117028295994, + "rewards/accuracy_reward": 0.06666666828095913, + "rewards/format_reward": 0.9687500298023224, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2163 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.464599609375, + "epoch": 0.6925908145303249, + "grad_norm": 0.12414728850126266, + "kl": 0.18391840681433677, + "learning_rate": 5.224238704978584e-06, + "loss": 0.0712, + "reward": 1.7281250357627869, + "reward_std": 0.17280828654766084, + "rewards/accuracy_reward": 0.016666666977107523, + "rewards/format_reward": 0.9708333432674408, + "rewards/tag_count_reward": 0.740625011920929, + "step": 2164 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.6791839599609, + "epoch": 0.6929108657385181, + "grad_norm": 0.15423992276191711, + "kl": 0.2515277363359928, + "learning_rate": 5.2144225040501375e-06, + "loss": 0.0848, + "reward": 1.7729167103767396, + "reward_std": 0.1920185036957264, + "rewards/accuracy_reward": 0.07500000298023224, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2165 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.2208557128906, + "epoch": 0.6932309169467115, + "grad_norm": 0.20855121314525604, + "kl": 0.4149386554956436, + "learning_rate": 5.2046122805222845e-06, + "loss": 0.1062, + "reward": 1.7333333492279053, + "reward_std": 0.234011735022068, + "rewards/accuracy_reward": 0.04583333358168602, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7375000298023224, + "step": 2166 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.0125152587891, + "epoch": 0.6935509681549048, + "grad_norm": 0.17261765897274017, + "kl": 0.31774858236312864, + "learning_rate": 5.194808046648434e-06, + "loss": 0.0679, + "reward": 1.743750011920929, + "reward_std": 0.19351360499858855, + "rewards/accuracy_reward": 0.0479166679084301, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2167 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.5458435058594, + "epoch": 0.6938710193630981, + "grad_norm": 0.1321432739496231, + "kl": 0.27640740275382997, + "learning_rate": 5.185009814674513e-06, + "loss": 0.1257, + "reward": 1.7958333849906922, + "reward_std": 0.2482768900692463, + "rewards/accuracy_reward": 0.10208333749324083, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2168 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.337515258789, + "epoch": 0.6941910705712914, + "grad_norm": 0.13096804916858673, + "kl": 0.26496610417962074, + "learning_rate": 5.175217596838956e-06, + "loss": 0.0941, + "reward": 1.7447916984558105, + "reward_std": 0.18921037912368774, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2169 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.9062805175781, + "epoch": 0.6945111217794847, + "grad_norm": 0.11738302558660507, + "kl": 0.23022876232862471, + "learning_rate": 5.165431405372674e-06, + "loss": 0.0782, + "reward": 1.7848958492279052, + "reward_std": 0.21770973801612853, + "rewards/accuracy_reward": 0.07708333358168602, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2170 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.9062622070312, + "epoch": 0.694831172987678, + "grad_norm": 0.16358114778995514, + "kl": 0.3719410330057144, + "learning_rate": 5.1556512524990636e-06, + "loss": 0.0994, + "reward": 1.7494791984558105, + "reward_std": 0.2786561943590641, + "rewards/accuracy_reward": 0.06250000130385161, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2171 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.7125213623046, + "epoch": 0.6951512241958714, + "grad_norm": 0.21294313669204712, + "kl": 0.2740899085998535, + "learning_rate": 5.145877150433967e-06, + "loss": 0.1084, + "reward": 1.7979166984558106, + "reward_std": 0.23426204025745392, + "rewards/accuracy_reward": 0.10416667088866234, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2172 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.1500183105469, + "epoch": 0.6954712754040646, + "grad_norm": 0.1394650638103485, + "kl": 0.3359750546514988, + "learning_rate": 5.1361091113856875e-06, + "loss": 0.1008, + "reward": 1.7921875357627868, + "reward_std": 0.15876233726739883, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2173 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.2875061035156, + "epoch": 0.695791326612258, + "grad_norm": 0.10760974884033203, + "kl": 0.25749983713030816, + "learning_rate": 5.126347147554936e-06, + "loss": 0.0922, + "reward": 1.7937500476837158, + "reward_std": 0.18401784151792527, + "rewards/accuracy_reward": 0.08958333600312471, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2174 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.1146026611328, + "epoch": 0.6961113778204513, + "grad_norm": 0.21234045922756195, + "kl": 0.38290523290634154, + "learning_rate": 5.116591271134839e-06, + "loss": 0.0837, + "reward": 1.7427083849906921, + "reward_std": 0.17233860194683076, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2175 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.0083557128906, + "epoch": 0.6964314290286446, + "grad_norm": 0.29094812273979187, + "kl": 0.4088446289300919, + "learning_rate": 5.106841494310929e-06, + "loss": 0.0907, + "reward": 1.7270833730697632, + "reward_std": 0.20447328686714172, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.9562500298023224, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2176 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.1562744140625, + "epoch": 0.6967514802368379, + "grad_norm": 0.24387161433696747, + "kl": 0.5301667034626008, + "learning_rate": 5.097097829261115e-06, + "loss": 0.0933, + "reward": 1.7411458969116211, + "reward_std": 0.19163210839033126, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2177 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.839599609375, + "epoch": 0.6970715314450312, + "grad_norm": 0.31514298915863037, + "kl": 0.340422347933054, + "learning_rate": 5.087360288155664e-06, + "loss": 0.098, + "reward": 1.7343750596046448, + "reward_std": 0.24370125085115432, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2178 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.7562713623047, + "epoch": 0.6973915826532245, + "grad_norm": 0.09657612442970276, + "kl": 0.2676196489483118, + "learning_rate": 5.077628883157205e-06, + "loss": 0.0499, + "reward": 1.756250023841858, + "reward_std": 0.12914905995130538, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7437500119209289, + "step": 2179 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.1396087646484, + "epoch": 0.6977116338614179, + "grad_norm": 0.24548500776290894, + "kl": 0.45806365422904494, + "learning_rate": 5.067903626420698e-06, + "loss": 0.1327, + "reward": 1.795312523841858, + "reward_std": 0.2129717141389847, + "rewards/accuracy_reward": 0.10208333730697632, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7328125178813935, + "step": 2180 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.3729370117187, + "epoch": 0.6980316850696111, + "grad_norm": 0.15102331340312958, + "kl": 0.37719000279903414, + "learning_rate": 5.058184530093424e-06, + "loss": 0.0906, + "reward": 1.7322916865348816, + "reward_std": 0.19234516769647597, + "rewards/accuracy_reward": 0.029166667722165585, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2181 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.0646057128906, + "epoch": 0.6983517362778044, + "grad_norm": 0.1571054458618164, + "kl": 0.33296659886837005, + "learning_rate": 5.048471606314971e-06, + "loss": 0.0758, + "reward": 1.7744791865348817, + "reward_std": 0.17921509444713593, + "rewards/accuracy_reward": 0.07083333786576987, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.739062511920929, + "step": 2182 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.8687622070313, + "epoch": 0.6986717874859978, + "grad_norm": 0.33309119939804077, + "kl": 0.4173093684017658, + "learning_rate": 5.038764867217214e-06, + "loss": 0.0955, + "reward": 1.7833333730697631, + "reward_std": 0.20730995163321495, + "rewards/accuracy_reward": 0.08333333749324083, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2183 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.502099609375, + "epoch": 0.6989918386941911, + "grad_norm": 0.15169481933116913, + "kl": 0.22802985832095146, + "learning_rate": 5.0290643249243065e-06, + "loss": 0.0639, + "reward": 1.760937511920929, + "reward_std": 0.13749925792217255, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.9791666805744171, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2184 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.8312683105469, + "epoch": 0.6993118899023844, + "grad_norm": 0.17143678665161133, + "kl": 0.23197558745741845, + "learning_rate": 5.019369991552658e-06, + "loss": 0.0367, + "reward": 1.8010416984558106, + "reward_std": 0.08791158646345139, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.9854166746139527, + "rewards/tag_count_reward": 0.7447916805744171, + "step": 2185 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.4062683105469, + "epoch": 0.6996319411105777, + "grad_norm": 0.05793720856308937, + "kl": 0.16285086199641227, + "learning_rate": 5.00968187921093e-06, + "loss": 0.045, + "reward": 1.848437523841858, + "reward_std": 0.1568957671523094, + "rewards/accuracy_reward": 0.12083333637565374, + "rewards/format_reward": 0.9833333432674408, + "rewards/tag_count_reward": 0.7442708432674408, + "step": 2186 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.0666931152343, + "epoch": 0.699951992318771, + "grad_norm": 0.42837584018707275, + "kl": 0.21833952143788338, + "learning_rate": 5.000000000000003e-06, + "loss": 0.0511, + "reward": 1.7682292103767394, + "reward_std": 0.12465300261974335, + "rewards/accuracy_reward": 0.05000000149011612, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2187 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.4041870117187, + "epoch": 0.7002720435269643, + "grad_norm": 0.08884342014789581, + "kl": 0.3156105622649193, + "learning_rate": 4.990324366012977e-06, + "loss": 0.0828, + "reward": 1.7864583492279054, + "reward_std": 0.1695403054356575, + "rewards/accuracy_reward": 0.07916666995733976, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.740625011920929, + "step": 2188 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.4437622070312, + "epoch": 0.7005920947351576, + "grad_norm": 0.23606713116168976, + "kl": 0.1369132250547409, + "learning_rate": 4.980654989335156e-06, + "loss": 0.0579, + "reward": 1.8208333611488343, + "reward_std": 0.10346375182271003, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7458333551883698, + "step": 2189 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.3271057128907, + "epoch": 0.7009121459433509, + "grad_norm": 0.13106216490268707, + "kl": 0.3166439961642027, + "learning_rate": 4.970991882044024e-06, + "loss": 0.0709, + "reward": 1.7213541746139527, + "reward_std": 0.14932389408349991, + "rewards/accuracy_reward": 0.018750001117587088, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2190 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.6271026611328, + "epoch": 0.7012321971515443, + "grad_norm": 0.19997036457061768, + "kl": 0.41348587200045583, + "learning_rate": 4.961335056209234e-06, + "loss": 0.0585, + "reward": 1.7145833611488341, + "reward_std": 0.1433960720896721, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2191 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.4062683105469, + "epoch": 0.7015522483597375, + "grad_norm": 0.20145119726657867, + "kl": 0.4204010270535946, + "learning_rate": 4.9516845238925926e-06, + "loss": 0.1152, + "reward": 1.7380208849906922, + "reward_std": 0.23682481199502944, + "rewards/accuracy_reward": 0.052083334885537624, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7338541805744171, + "step": 2192 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.3208557128906, + "epoch": 0.7018722995679308, + "grad_norm": 0.1150142028927803, + "kl": 0.34649395793676374, + "learning_rate": 4.942040297148049e-06, + "loss": 0.0949, + "reward": 1.8109375596046449, + "reward_std": 0.2290053188800812, + "rewards/accuracy_reward": 0.11875000298023224, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2193 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.8625122070313, + "epoch": 0.7021923507761242, + "grad_norm": 0.20946632325649261, + "kl": 0.3540042258799076, + "learning_rate": 4.932402388021677e-06, + "loss": 0.1114, + "reward": 1.7692708611488341, + "reward_std": 0.24689008817076682, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7401041924953461, + "step": 2194 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.9604431152344, + "epoch": 0.7025124019843175, + "grad_norm": 0.21829591691493988, + "kl": 0.3825660213828087, + "learning_rate": 4.922770808551649e-06, + "loss": 0.0676, + "reward": 1.7973958849906921, + "reward_std": 0.21412076726555823, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.9583333671092987, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2195 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.0666809082031, + "epoch": 0.7028324531925108, + "grad_norm": 0.21612143516540527, + "kl": 0.23135574162006378, + "learning_rate": 4.913145570768243e-06, + "loss": 0.1101, + "reward": 1.7859375476837158, + "reward_std": 0.17264212369918824, + "rewards/accuracy_reward": 0.07916666902601718, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7380208611488343, + "step": 2196 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.3833618164062, + "epoch": 0.7031525044007041, + "grad_norm": 0.10298977792263031, + "kl": 0.20634137317538262, + "learning_rate": 4.9035266866938125e-06, + "loss": 0.0891, + "reward": 1.8015625476837158, + "reward_std": 0.23462976813316344, + "rewards/accuracy_reward": 0.10625000149011612, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2197 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.5396057128906, + "epoch": 0.7034725556088974, + "grad_norm": 0.17319689691066742, + "kl": 0.4252075083553791, + "learning_rate": 4.8939141683427735e-06, + "loss": 0.1317, + "reward": 1.7807291865348815, + "reward_std": 0.28460691273212435, + "rewards/accuracy_reward": 0.09583333767950535, + "rewards/format_reward": 0.9500000357627869, + "rewards/tag_count_reward": 0.7348958492279053, + "step": 2198 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.5041809082031, + "epoch": 0.7037926068170908, + "grad_norm": 0.37925511598587036, + "kl": 0.25806584507226943, + "learning_rate": 4.884308027721593e-06, + "loss": 0.1236, + "reward": 1.7067708730697633, + "reward_std": 0.16876881793141366, + "rewards/accuracy_reward": 0.00625, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2199 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.7271057128906, + "epoch": 0.704112658025284, + "grad_norm": 0.1057218387722969, + "kl": 0.24062796980142592, + "learning_rate": 4.87470827682877e-06, + "loss": 0.0878, + "reward": 1.7885417222976685, + "reward_std": 0.19179236218333245, + "rewards/accuracy_reward": 0.08125000260770321, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2200 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.097933959961, + "epoch": 0.7044327092334773, + "grad_norm": 0.1587941199541092, + "kl": 0.3910080552101135, + "learning_rate": 4.865114927654824e-06, + "loss": 0.1293, + "reward": 1.8296875357627869, + "reward_std": 0.24853597730398178, + "rewards/accuracy_reward": 0.1500000050291419, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2201 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.4812683105469, + "epoch": 0.7047527604416707, + "grad_norm": 0.16094599664211273, + "kl": 0.3398930035531521, + "learning_rate": 4.855527992182281e-06, + "loss": 0.09, + "reward": 1.7609375238418579, + "reward_std": 0.21207387149333953, + "rewards/accuracy_reward": 0.06250000335276126, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2202 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.9229309082032, + "epoch": 0.705072811649864, + "grad_norm": 0.32247263193130493, + "kl": 0.5623447112739086, + "learning_rate": 4.8459474823856445e-06, + "loss": 0.11, + "reward": 1.6973958730697631, + "reward_std": 0.22016318514943123, + "rewards/accuracy_reward": 0.018750000186264514, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7307291924953461, + "step": 2203 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.1750244140625, + "epoch": 0.7053928628580572, + "grad_norm": 0.2458748072385788, + "kl": 0.33539998829364776, + "learning_rate": 4.836373410231405e-06, + "loss": 0.1112, + "reward": 1.7250000357627868, + "reward_std": 0.2504465445876122, + "rewards/accuracy_reward": 0.0479166692122817, + "rewards/format_reward": 0.9395833671092987, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2204 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.4750244140625, + "epoch": 0.7057129140662506, + "grad_norm": 0.16893352568149567, + "kl": 0.3260126397013664, + "learning_rate": 4.8268057876780075e-06, + "loss": 0.0876, + "reward": 1.7765625357627868, + "reward_std": 0.2559411033987999, + "rewards/accuracy_reward": 0.07500000186264515, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2205 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.6479248046875, + "epoch": 0.7060329652744439, + "grad_norm": 0.21566948294639587, + "kl": 0.4895564019680023, + "learning_rate": 4.81724462667584e-06, + "loss": 0.1139, + "reward": 1.6729167103767395, + "reward_std": 0.22808781638741493, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.9437500238418579, + "rewards/tag_count_reward": 0.7270833492279053, + "step": 2206 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.2021057128907, + "epoch": 0.7063530164826373, + "grad_norm": 0.13912886381149292, + "kl": 0.29945429414510727, + "learning_rate": 4.807689939167222e-06, + "loss": 0.0767, + "reward": 1.7656250476837159, + "reward_std": 0.17697276175022125, + "rewards/accuracy_reward": 0.06250000204890967, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2207 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.4666900634766, + "epoch": 0.7066730676908305, + "grad_norm": 0.19941870868206024, + "kl": 0.3455564148724079, + "learning_rate": 4.798141737086384e-06, + "loss": 0.1115, + "reward": 1.90208340883255, + "reward_std": 0.1985380657017231, + "rewards/accuracy_reward": 0.202083339355886, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.737500011920929, + "step": 2208 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.191683959961, + "epoch": 0.7069931188990238, + "grad_norm": 0.32233765721321106, + "kl": 0.278411491215229, + "learning_rate": 4.788600032359461e-06, + "loss": 0.0535, + "reward": 1.825000023841858, + "reward_std": 0.16070781499147416, + "rewards/accuracy_reward": 0.11250000204890967, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2209 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.3521026611328, + "epoch": 0.7073131701072172, + "grad_norm": 0.1364719420671463, + "kl": 0.49797850996255877, + "learning_rate": 4.77906483690447e-06, + "loss": 0.1008, + "reward": 1.7989583611488342, + "reward_std": 0.2100510597229004, + "rewards/accuracy_reward": 0.10000000204890966, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7343750298023224, + "step": 2210 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.9500274658203, + "epoch": 0.7076332213154105, + "grad_norm": 0.20895767211914062, + "kl": 0.31710937693715097, + "learning_rate": 4.769536162631292e-06, + "loss": 0.0932, + "reward": 1.8171875476837158, + "reward_std": 0.19292281717061996, + "rewards/accuracy_reward": 0.11250000409781932, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2211 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.5812683105469, + "epoch": 0.7079532725236037, + "grad_norm": 0.12749753892421722, + "kl": 0.40187211334705353, + "learning_rate": 4.760014021441671e-06, + "loss": 0.1104, + "reward": 1.7026042103767396, + "reward_std": 0.2228372722864151, + "rewards/accuracy_reward": 0.016666667349636555, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7380208671092987, + "step": 2212 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.4541839599609, + "epoch": 0.7082733237317971, + "grad_norm": 0.18957151472568512, + "kl": 0.18363816738128663, + "learning_rate": 4.750498425229188e-06, + "loss": 0.0541, + "reward": 1.8494792342185975, + "reward_std": 0.22633131146430968, + "rewards/accuracy_reward": 0.13750000670552254, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7432291805744171, + "step": 2213 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.0437683105469, + "epoch": 0.7085933749399904, + "grad_norm": 0.14039163291454315, + "kl": 0.4518029972910881, + "learning_rate": 4.740989385879248e-06, + "loss": 0.0622, + "reward": 1.8098958492279054, + "reward_std": 0.23249467983841896, + "rewards/accuracy_reward": 0.11666667088866234, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2214 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.1229309082031, + "epoch": 0.7089134261481838, + "grad_norm": 0.1878720074892044, + "kl": 0.1566619262099266, + "learning_rate": 4.731486915269066e-06, + "loss": 0.0492, + "reward": 1.8062500357627869, + "reward_std": 0.13992664515972136, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.7458333492279052, + "step": 2215 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.652099609375, + "epoch": 0.709233477356377, + "grad_norm": 0.11067891865968704, + "kl": 0.2782533496618271, + "learning_rate": 4.721991025267657e-06, + "loss": 0.0475, + "reward": 1.8005208849906922, + "reward_std": 0.21060936525464058, + "rewards/accuracy_reward": 0.09375000391155482, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2216 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.7771087646485, + "epoch": 0.7095535285645703, + "grad_norm": 0.2800833582878113, + "kl": 0.2611836478114128, + "learning_rate": 4.712501727735808e-06, + "loss": 0.1179, + "reward": 1.8302083730697631, + "reward_std": 0.2340967148542404, + "rewards/accuracy_reward": 0.1395833384245634, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 2217 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.495849609375, + "epoch": 0.7098735797727637, + "grad_norm": 0.11880511045455933, + "kl": 0.31443496271967886, + "learning_rate": 4.703019034526082e-06, + "loss": 0.0788, + "reward": 1.7661458492279052, + "reward_std": 0.18282609283924103, + "rewards/accuracy_reward": 0.06458333488553762, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2218 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.8687683105469, + "epoch": 0.710193630980957, + "grad_norm": 0.14947304129600525, + "kl": 0.4382303521037102, + "learning_rate": 4.693542957482786e-06, + "loss": 0.0875, + "reward": 1.7578125238418578, + "reward_std": 0.22819079458713531, + "rewards/accuracy_reward": 0.0687500013038516, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2219 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.245849609375, + "epoch": 0.7105136821891502, + "grad_norm": 0.23497365415096283, + "kl": 0.28563379757106305, + "learning_rate": 4.684073508441961e-06, + "loss": 0.0807, + "reward": 1.8682292342185973, + "reward_std": 0.1950744114816189, + "rewards/accuracy_reward": 0.17708334028720857, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7411458432674408, + "step": 2220 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.0354309082031, + "epoch": 0.7108337333973436, + "grad_norm": 0.09888350963592529, + "kl": 0.2779009331017733, + "learning_rate": 4.674610699231373e-06, + "loss": 0.0489, + "reward": 1.7447916984558105, + "reward_std": 0.18470503836870195, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2221 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.5125152587891, + "epoch": 0.7111537846055369, + "grad_norm": 0.23047621548175812, + "kl": 0.2806026488542557, + "learning_rate": 4.665154541670498e-06, + "loss": 0.1104, + "reward": 1.7427083611488343, + "reward_std": 0.20498983785510064, + "rewards/accuracy_reward": 0.041666666977107525, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2222 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.8708557128906, + "epoch": 0.7114738358137302, + "grad_norm": 0.19911795854568481, + "kl": 0.13476757146418095, + "learning_rate": 4.655705047570498e-06, + "loss": 0.0467, + "reward": 1.8401041984558106, + "reward_std": 0.1357252113521099, + "rewards/accuracy_reward": 0.11875000353902579, + "rewards/format_reward": 0.9770833432674408, + "rewards/tag_count_reward": 0.7442708432674408, + "step": 2223 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.1687744140625, + "epoch": 0.7117938870219235, + "grad_norm": 0.16080540418624878, + "kl": 0.5152991503477097, + "learning_rate": 4.6462622287342154e-06, + "loss": 0.122, + "reward": 1.7302083849906922, + "reward_std": 0.23456745445728303, + "rewards/accuracy_reward": 0.04583333544433117, + "rewards/format_reward": 0.9416666805744172, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 2224 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.8166870117187, + "epoch": 0.7121139382301168, + "grad_norm": 0.2366468459367752, + "kl": 0.670868530869484, + "learning_rate": 4.636826096956153e-06, + "loss": 0.1181, + "reward": 1.7375000357627868, + "reward_std": 0.2714173913002014, + "rewards/accuracy_reward": 0.05000000223517418, + "rewards/format_reward": 0.9520833611488342, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2225 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.1687713623047, + "epoch": 0.7124339894383102, + "grad_norm": 0.19582653045654297, + "kl": 0.33867536932229997, + "learning_rate": 4.627396664022462e-06, + "loss": 0.0905, + "reward": 1.806250023841858, + "reward_std": 0.23281241804361344, + "rewards/accuracy_reward": 0.10416667144745588, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.737500011920929, + "step": 2226 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.6937713623047, + "epoch": 0.7127540406465035, + "grad_norm": 0.20600047707557678, + "kl": 0.41766203939914703, + "learning_rate": 4.617973941710932e-06, + "loss": 0.0986, + "reward": 1.7557292222976684, + "reward_std": 0.2621150605380535, + "rewards/accuracy_reward": 0.0729166679084301, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2227 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.0687744140625, + "epoch": 0.7130740918546967, + "grad_norm": 0.1924317628145218, + "kl": 0.4410012990236282, + "learning_rate": 4.608557941790954e-06, + "loss": 0.0809, + "reward": 1.7578125238418578, + "reward_std": 0.22074405550956727, + "rewards/accuracy_reward": 0.05416666846722364, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7411458611488342, + "step": 2228 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.3833557128906, + "epoch": 0.7133941430628901, + "grad_norm": 0.2038911134004593, + "kl": 0.3418886814266443, + "learning_rate": 4.5991486760235404e-06, + "loss": 0.0756, + "reward": 1.9088542103767394, + "reward_std": 0.19126099869608879, + "rewards/accuracy_reward": 0.20416667088866233, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2229 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.6812713623046, + "epoch": 0.7137141942710834, + "grad_norm": 0.33020251989364624, + "kl": 0.2924545969814062, + "learning_rate": 4.5897461561612814e-06, + "loss": 0.0629, + "reward": 1.8562500476837158, + "reward_std": 0.16590869426727295, + "rewards/accuracy_reward": 0.135416672937572, + "rewards/format_reward": 0.9770833432674408, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2230 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.402099609375, + "epoch": 0.7140342454792766, + "grad_norm": 0.20029504597187042, + "kl": 0.37912337966263293, + "learning_rate": 4.580350393948355e-06, + "loss": 0.0894, + "reward": 1.82239590883255, + "reward_std": 0.15126769095659257, + "rewards/accuracy_reward": 0.10833333730697632, + "rewards/format_reward": 0.975000011920929, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2231 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.1562683105469, + "epoch": 0.71435429668747, + "grad_norm": 0.1418682336807251, + "kl": 0.33628618270158767, + "learning_rate": 4.5709614011204794e-06, + "loss": 0.124, + "reward": 1.8239583849906922, + "reward_std": 0.2332266129553318, + "rewards/accuracy_reward": 0.14166666865348815, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2232 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.0396057128906, + "epoch": 0.7146743478956633, + "grad_norm": 0.17476728558540344, + "kl": 0.34939279705286025, + "learning_rate": 4.561579189404929e-06, + "loss": 0.0956, + "reward": 1.7447917103767394, + "reward_std": 0.24993923604488372, + "rewards/accuracy_reward": 0.056250001303851604, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7343750298023224, + "step": 2233 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.0020935058594, + "epoch": 0.7149943991038566, + "grad_norm": 0.14975003898143768, + "kl": 0.3478496439754963, + "learning_rate": 4.552203770520508e-06, + "loss": 0.0921, + "reward": 1.7145833611488341, + "reward_std": 0.14527590125799178, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.9708333432674408, + "rewards/tag_count_reward": 0.737500011920929, + "step": 2234 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.6812622070313, + "epoch": 0.7153144503120499, + "grad_norm": 0.2850889563560486, + "kl": 0.3510056212544441, + "learning_rate": 4.542835156177537e-06, + "loss": 0.0883, + "reward": 1.7458333730697633, + "reward_std": 0.16523813009262084, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2235 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.1396057128907, + "epoch": 0.7156345015202432, + "grad_norm": 0.15181845426559448, + "kl": 0.4017877370119095, + "learning_rate": 4.5334733580778305e-06, + "loss": 0.128, + "reward": 1.6812500357627869, + "reward_std": 0.27939036712050436, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.9375000178813935, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2236 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.2750244140625, + "epoch": 0.7159545527284366, + "grad_norm": 0.15312950313091278, + "kl": 0.3317554071545601, + "learning_rate": 4.5241183879146926e-06, + "loss": 0.0944, + "reward": 1.7895833849906921, + "reward_std": 0.2161307230591774, + "rewards/accuracy_reward": 0.08958333618938923, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2237 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.0541900634765, + "epoch": 0.7162746039366299, + "grad_norm": 0.21131104230880737, + "kl": 0.3239736631512642, + "learning_rate": 4.5147702573729015e-06, + "loss": 0.108, + "reward": 1.8375000476837158, + "reward_std": 0.3055271409451962, + "rewards/accuracy_reward": 0.15625000558793545, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7312500298023223, + "step": 2238 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.0250213623046, + "epoch": 0.7165946551448231, + "grad_norm": 0.2556743621826172, + "kl": 0.29245643988251685, + "learning_rate": 4.505428978128693e-06, + "loss": 0.0769, + "reward": 1.7364583730697631, + "reward_std": 0.16923416927456855, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.740625011920929, + "step": 2239 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.1437744140625, + "epoch": 0.7169147063530165, + "grad_norm": 0.2496984452009201, + "kl": 0.4537731699645519, + "learning_rate": 4.496094561849741e-06, + "loss": 0.117, + "reward": 1.844270896911621, + "reward_std": 0.22066160291433334, + "rewards/accuracy_reward": 0.1500000050291419, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2240 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.0208618164063, + "epoch": 0.7172347575612098, + "grad_norm": 0.23430490493774414, + "kl": 0.41818406283855436, + "learning_rate": 4.486767020195151e-06, + "loss": 0.0875, + "reward": 1.7734375715255737, + "reward_std": 0.24914529621601106, + "rewards/accuracy_reward": 0.08958333563059569, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7338541805744171, + "step": 2241 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.6146087646484, + "epoch": 0.7175548087694031, + "grad_norm": 0.16890211403369904, + "kl": 0.4942522332072258, + "learning_rate": 4.4774463648154375e-06, + "loss": 0.1103, + "reward": 1.7401042222976684, + "reward_std": 0.21844895631074907, + "rewards/accuracy_reward": 0.0479166679084301, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2242 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.2750213623046, + "epoch": 0.7178748599775964, + "grad_norm": 0.12553976476192474, + "kl": 0.27406597658991816, + "learning_rate": 4.46813260735252e-06, + "loss": 0.1205, + "reward": 1.756250023841858, + "reward_std": 0.24123900830745698, + "rewards/accuracy_reward": 0.06875000260770321, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2243 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.3750244140625, + "epoch": 0.7181949111857897, + "grad_norm": 0.1890048384666443, + "kl": 0.30552619621157645, + "learning_rate": 4.4588257594397e-06, + "loss": 0.0785, + "reward": 1.725000023841858, + "reward_std": 0.18632035106420516, + "rewards/accuracy_reward": 0.025, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2244 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.6854309082031, + "epoch": 0.718514962393983, + "grad_norm": 0.19801272451877594, + "kl": 0.32198435068130493, + "learning_rate": 4.4495258327016415e-06, + "loss": 0.0965, + "reward": 1.8354167342185974, + "reward_std": 0.2720773681998253, + "rewards/accuracy_reward": 0.14583333916962146, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7333333551883697, + "step": 2245 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.7771057128906, + "epoch": 0.7188350136021764, + "grad_norm": 0.14209610223770142, + "kl": 0.2674247484654188, + "learning_rate": 4.44023283875437e-06, + "loss": 0.0892, + "reward": 1.8104167103767395, + "reward_std": 0.20731791108846664, + "rewards/accuracy_reward": 0.11041666902601718, + "rewards/format_reward": 0.9604166746139526, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2246 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.7958526611328, + "epoch": 0.7191550648103696, + "grad_norm": 0.2517361044883728, + "kl": 0.29758902490139005, + "learning_rate": 4.430946789205255e-06, + "loss": 0.1018, + "reward": 1.7119791746139525, + "reward_std": 0.22636782750487328, + "rewards/accuracy_reward": 0.027083333395421506, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7307291865348816, + "step": 2247 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.150015258789, + "epoch": 0.719475116018563, + "grad_norm": 0.16266785562038422, + "kl": 0.2869046814739704, + "learning_rate": 4.421667695652987e-06, + "loss": 0.0691, + "reward": 1.732812523841858, + "reward_std": 0.20949894338846206, + "rewards/accuracy_reward": 0.0375000013038516, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2248 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.993765258789, + "epoch": 0.7197951672267563, + "grad_norm": 0.09789406508207321, + "kl": 0.16912736520171165, + "learning_rate": 4.412395569687568e-06, + "loss": 0.0375, + "reward": 1.7848958611488341, + "reward_std": 0.15943676605820656, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7432291746139527, + "step": 2249 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.8875183105469, + "epoch": 0.7201152184349496, + "grad_norm": 0.38918501138687134, + "kl": 0.31766297519207, + "learning_rate": 4.403130422890299e-06, + "loss": 0.0914, + "reward": 1.7619791865348815, + "reward_std": 0.23368189930915834, + "rewards/accuracy_reward": 0.06666666716337204, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2250 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.5479248046875, + "epoch": 0.7204352696431429, + "grad_norm": 0.15137572586536407, + "kl": 0.4091979868710041, + "learning_rate": 4.393872266833764e-06, + "loss": 0.1025, + "reward": 1.784375047683716, + "reward_std": 0.1803455211222172, + "rewards/accuracy_reward": 0.08125000223517417, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2251 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.0750244140625, + "epoch": 0.7207553208513362, + "grad_norm": 0.4936150014400482, + "kl": 0.4002502106130123, + "learning_rate": 4.3846211130818185e-06, + "loss": 0.1167, + "reward": 1.7380208611488341, + "reward_std": 0.23237907737493516, + "rewards/accuracy_reward": 0.054166667722165586, + "rewards/format_reward": 0.9500000298023223, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2252 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.0875183105469, + "epoch": 0.7210753720595295, + "grad_norm": 0.18475115299224854, + "kl": 0.27582458928227427, + "learning_rate": 4.375376973189559e-06, + "loss": 0.0916, + "reward": 1.7911458373069764, + "reward_std": 0.19789444506168366, + "rewards/accuracy_reward": 0.07708333488553762, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7453125238418579, + "step": 2253 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.302099609375, + "epoch": 0.7213954232677229, + "grad_norm": 0.3201963007450104, + "kl": 0.3628829248249531, + "learning_rate": 4.3661398587033355e-06, + "loss": 0.0856, + "reward": 1.8302083730697631, + "reward_std": 0.17892763316631316, + "rewards/accuracy_reward": 0.12291667032986879, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2254 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.0312683105469, + "epoch": 0.7217154744759161, + "grad_norm": 0.29653018712997437, + "kl": 0.3145171828567982, + "learning_rate": 4.356909781160716e-06, + "loss": 0.1128, + "reward": 1.8046875715255737, + "reward_std": 0.21797730773687363, + "rewards/accuracy_reward": 0.11250000298023224, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2255 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.9687683105469, + "epoch": 0.7220355256841094, + "grad_norm": 0.22039726376533508, + "kl": 0.4378112189471722, + "learning_rate": 4.347686752090482e-06, + "loss": 0.1191, + "reward": 1.7296875596046448, + "reward_std": 0.2500815257430077, + "rewards/accuracy_reward": 0.047916668094694616, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2256 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.7437744140625, + "epoch": 0.7223555768923028, + "grad_norm": 0.27176499366760254, + "kl": 0.49834114536643026, + "learning_rate": 4.338470783012609e-06, + "loss": 0.1389, + "reward": 1.7661458849906921, + "reward_std": 0.2512555614113808, + "rewards/accuracy_reward": 0.08333333563059568, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7286458492279053, + "step": 2257 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.535433959961, + "epoch": 0.7226756281004961, + "grad_norm": 0.15212570130825043, + "kl": 0.410233548656106, + "learning_rate": 4.3292618854382564e-06, + "loss": 0.1073, + "reward": 1.7916666984558105, + "reward_std": 0.20430766493082048, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7333333492279053, + "step": 2258 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.9896026611328, + "epoch": 0.7229956793086894, + "grad_norm": 0.21144744753837585, + "kl": 0.49212879687547684, + "learning_rate": 4.320060070869747e-06, + "loss": 0.1128, + "reward": 1.7947917222976684, + "reward_std": 0.2727638013660908, + "rewards/accuracy_reward": 0.11458333786576987, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7322916865348816, + "step": 2259 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.808349609375, + "epoch": 0.7233157305168827, + "grad_norm": 0.26423129439353943, + "kl": 0.5360765296965837, + "learning_rate": 4.310865350800566e-06, + "loss": 0.1251, + "reward": 1.7338542103767396, + "reward_std": 0.2479950025677681, + "rewards/accuracy_reward": 0.08333333637565374, + "rewards/format_reward": 0.9270833492279053, + "rewards/tag_count_reward": 0.723437511920929, + "step": 2260 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.3375244140625, + "epoch": 0.723635781725076, + "grad_norm": 0.48073655366897583, + "kl": 0.543912273645401, + "learning_rate": 4.3016777367153206e-06, + "loss": 0.1057, + "reward": 1.7390625476837158, + "reward_std": 0.24218678548932077, + "rewards/accuracy_reward": 0.05416666828095913, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7307291924953461, + "step": 2261 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.2375122070313, + "epoch": 0.7239558329332694, + "grad_norm": 0.27856066823005676, + "kl": 0.48950769305229186, + "learning_rate": 4.292497240089758e-06, + "loss": 0.1204, + "reward": 1.7489583849906922, + "reward_std": 0.23832616060972214, + "rewards/accuracy_reward": 0.07291666977107525, + "rewards/format_reward": 0.9458333611488342, + "rewards/tag_count_reward": 0.7302083551883698, + "step": 2262 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.6250152587891, + "epoch": 0.7242758841414626, + "grad_norm": 0.2674890458583832, + "kl": 0.5480339720845222, + "learning_rate": 4.283323872390728e-06, + "loss": 0.1184, + "reward": 1.7369791984558105, + "reward_std": 0.24941499531269073, + "rewards/accuracy_reward": 0.060416667722165585, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7307291984558105, + "step": 2263 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.045849609375, + "epoch": 0.7245959353496559, + "grad_norm": 0.12551912665367126, + "kl": 0.32917521223425866, + "learning_rate": 4.274157645076179e-06, + "loss": 0.0759, + "reward": 1.789062535762787, + "reward_std": 0.2022472068667412, + "rewards/accuracy_reward": 0.08125000149011612, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2264 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.833349609375, + "epoch": 0.7249159865578493, + "grad_norm": 0.16230449080467224, + "kl": 0.269290691614151, + "learning_rate": 4.264998569595138e-06, + "loss": 0.0549, + "reward": 1.8661459326744079, + "reward_std": 0.19540103599429132, + "rewards/accuracy_reward": 0.15833333861082793, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2265 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.0396118164062, + "epoch": 0.7252360377660426, + "grad_norm": 0.17532478272914886, + "kl": 0.3482740193605423, + "learning_rate": 4.255846657387701e-06, + "loss": 0.0833, + "reward": 1.7963541865348815, + "reward_std": 0.2054486319422722, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2266 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.8541870117188, + "epoch": 0.7255560889742358, + "grad_norm": 0.11146936565637589, + "kl": 0.29559036940336225, + "learning_rate": 4.246701919885017e-06, + "loss": 0.0928, + "reward": 1.7734375238418578, + "reward_std": 0.20702899396419525, + "rewards/accuracy_reward": 0.08125000149011612, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2267 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.8333679199219, + "epoch": 0.7258761401824292, + "grad_norm": 0.13458676636219025, + "kl": 0.368284372985363, + "learning_rate": 4.2375643685092745e-06, + "loss": 0.0907, + "reward": 1.7427083849906921, + "reward_std": 0.19133371710777283, + "rewards/accuracy_reward": 0.0458333345130086, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2268 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.6687652587891, + "epoch": 0.7261961913906225, + "grad_norm": 0.15233393013477325, + "kl": 0.19312002062797545, + "learning_rate": 4.228434014673679e-06, + "loss": 0.0697, + "reward": 1.7552083611488343, + "reward_std": 0.14367434456944467, + "rewards/accuracy_reward": 0.04375000149011612, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 2269 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.3854339599609, + "epoch": 0.7265162425988159, + "grad_norm": 0.19035114347934723, + "kl": 0.3742452569305897, + "learning_rate": 4.219310869782453e-06, + "loss": 0.0632, + "reward": 1.7869791984558105, + "reward_std": 0.17124900594353676, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.9729166924953461, + "rewards/tag_count_reward": 0.736979192495346, + "step": 2270 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.4958435058594, + "epoch": 0.7268362938070091, + "grad_norm": 0.18626444041728973, + "kl": 0.29630909487605095, + "learning_rate": 4.210194945230815e-06, + "loss": 0.0921, + "reward": 1.7609375357627868, + "reward_std": 0.19042362570762633, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2271 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.681265258789, + "epoch": 0.7271563450152024, + "grad_norm": 0.18361890316009521, + "kl": 0.27902993783354757, + "learning_rate": 4.201086252404962e-06, + "loss": 0.0884, + "reward": 1.7515625596046447, + "reward_std": 0.254750494658947, + "rewards/accuracy_reward": 0.06666666846722365, + "rewards/format_reward": 0.9520833611488342, + "rewards/tag_count_reward": 0.732812511920929, + "step": 2272 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.4583465576172, + "epoch": 0.7274763962233958, + "grad_norm": 0.13005219399929047, + "kl": 0.25048135630786417, + "learning_rate": 4.19198480268206e-06, + "loss": 0.0997, + "reward": 1.7901041984558106, + "reward_std": 0.25187977477908136, + "rewards/accuracy_reward": 0.08541666977107525, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2273 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.5375183105468, + "epoch": 0.727796447431589, + "grad_norm": 0.07397562265396118, + "kl": 0.1687136735767126, + "learning_rate": 4.1828906074302255e-06, + "loss": 0.0406, + "reward": 1.748437523841858, + "reward_std": 0.11814125031232833, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.975000011920929, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2274 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.7416870117188, + "epoch": 0.7281164986397823, + "grad_norm": 0.10808942466974258, + "kl": 0.22886997163295747, + "learning_rate": 4.1738036780085175e-06, + "loss": 0.0888, + "reward": 1.7609375596046448, + "reward_std": 0.19000280499458314, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2275 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.3750244140625, + "epoch": 0.7284365498479757, + "grad_norm": 0.20324602723121643, + "kl": 0.2584953740239143, + "learning_rate": 4.164724025766917e-06, + "loss": 0.0639, + "reward": 1.795312523841858, + "reward_std": 0.19345357716083528, + "rewards/accuracy_reward": 0.08333333674818277, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2276 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.2333618164063, + "epoch": 0.728756601056169, + "grad_norm": 0.15485358238220215, + "kl": 0.19948984831571578, + "learning_rate": 4.155651662046319e-06, + "loss": 0.0708, + "reward": 1.8125000476837159, + "reward_std": 0.17140211313962936, + "rewards/accuracy_reward": 0.09375000428408384, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.743750023841858, + "step": 2277 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.4083587646485, + "epoch": 0.7290766522643622, + "grad_norm": 0.12642821669578552, + "kl": 0.30407530032098296, + "learning_rate": 4.1465865981785055e-06, + "loss": 0.1133, + "reward": 1.7520833730697631, + "reward_std": 0.22057257741689681, + "rewards/accuracy_reward": 0.05833333507180214, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2278 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.9625213623046, + "epoch": 0.7293967034725556, + "grad_norm": 0.15293292701244354, + "kl": 0.3121527761220932, + "learning_rate": 4.137528845486152e-06, + "loss": 0.085, + "reward": 1.825520896911621, + "reward_std": 0.162899911403656, + "rewards/accuracy_reward": 0.11250000298023224, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2279 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.7583526611328, + "epoch": 0.7297167546807489, + "grad_norm": 0.25029483437538147, + "kl": 0.4540176376700401, + "learning_rate": 4.128478415282795e-06, + "loss": 0.1197, + "reward": 1.8125000476837159, + "reward_std": 0.2490193247795105, + "rewards/accuracy_reward": 0.12291667070239783, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7312500238418579, + "step": 2280 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.8854431152344, + "epoch": 0.7300368058889423, + "grad_norm": 0.30002540349960327, + "kl": 0.27474406994879247, + "learning_rate": 4.11943531887283e-06, + "loss": 0.0807, + "reward": 1.7744791984558106, + "reward_std": 0.15698974579572678, + "rewards/accuracy_reward": 0.05625000111758709, + "rewards/format_reward": 0.9750000238418579, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2281 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.2104370117188, + "epoch": 0.7303568570971355, + "grad_norm": 0.09751484543085098, + "kl": 0.2339543603360653, + "learning_rate": 4.1103995675514865e-06, + "loss": 0.1043, + "reward": 1.756770873069763, + "reward_std": 0.1833608940243721, + "rewards/accuracy_reward": 0.05000000149011612, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2282 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.4479431152344, + "epoch": 0.7306769083053288, + "grad_norm": 0.10962416976690292, + "kl": 0.2689382560551167, + "learning_rate": 4.101371172604823e-06, + "loss": 0.0902, + "reward": 1.748958373069763, + "reward_std": 0.177383716404438, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7364583611488342, + "step": 2283 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.0396118164062, + "epoch": 0.7309969595135222, + "grad_norm": 0.16082532703876495, + "kl": 0.17291892133653164, + "learning_rate": 4.0923501453097115e-06, + "loss": 0.0739, + "reward": 1.7531250357627868, + "reward_std": 0.14365102648735045, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2284 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.5771026611328, + "epoch": 0.7313170107217155, + "grad_norm": 0.15020890533924103, + "kl": 0.2913985226303339, + "learning_rate": 4.08333649693382e-06, + "loss": 0.0784, + "reward": 1.7869791984558105, + "reward_std": 0.19107620120048524, + "rewards/accuracy_reward": 0.07916666828095913, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2285 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.0666778564453, + "epoch": 0.7316370619299087, + "grad_norm": 0.17711031436920166, + "kl": 0.36008758544921876, + "learning_rate": 4.074330238735592e-06, + "loss": 0.0677, + "reward": 1.7583333611488343, + "reward_std": 0.14769948348402978, + "rewards/accuracy_reward": 0.0479166692122817, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2286 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.268765258789, + "epoch": 0.7319571131381021, + "grad_norm": 0.08795570582151413, + "kl": 0.25694953128695486, + "learning_rate": 4.065331381964252e-06, + "loss": 0.0852, + "reward": 1.8203125238418578, + "reward_std": 0.24367085993289947, + "rewards/accuracy_reward": 0.11875000391155481, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2287 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.7375244140625, + "epoch": 0.7322771643462954, + "grad_norm": 0.13169290125370026, + "kl": 0.21410394608974456, + "learning_rate": 4.056339937859776e-06, + "loss": 0.0555, + "reward": 1.8015625476837158, + "reward_std": 0.1801581375300884, + "rewards/accuracy_reward": 0.08125000391155482, + "rewards/format_reward": 0.975000011920929, + "rewards/tag_count_reward": 0.7453125178813934, + "step": 2288 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.9333587646485, + "epoch": 0.7325972155544888, + "grad_norm": 0.08827610313892365, + "kl": 0.39082359373569486, + "learning_rate": 4.047355917652877e-06, + "loss": 0.0876, + "reward": 1.7973958849906921, + "reward_std": 0.18793805167078972, + "rewards/accuracy_reward": 0.09375000316649676, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 2289 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.5562744140625, + "epoch": 0.732917266762682, + "grad_norm": 0.2297748476266861, + "kl": 0.28518550768494605, + "learning_rate": 4.0383793325650025e-06, + "loss": 0.0704, + "reward": 1.7703125476837158, + "reward_std": 0.18034229278564454, + "rewards/accuracy_reward": 0.08958333637565374, + "rewards/format_reward": 0.950000011920929, + "rewards/tag_count_reward": 0.7307291865348816, + "step": 2290 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.1250061035156, + "epoch": 0.7332373179708753, + "grad_norm": 0.11378385126590729, + "kl": 0.3131607033312321, + "learning_rate": 4.0294101938083065e-06, + "loss": 0.0978, + "reward": 1.7062500238418579, + "reward_std": 0.18863984048366547, + "rewards/accuracy_reward": 0.008333333395421505, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2291 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.3500183105468, + "epoch": 0.7335573691790687, + "grad_norm": 0.15199492871761322, + "kl": 0.28899841830134393, + "learning_rate": 4.0204485125856465e-06, + "loss": 0.0806, + "reward": 1.7791666984558105, + "reward_std": 0.12981051132082938, + "rewards/accuracy_reward": 0.05416666846722364, + "rewards/format_reward": 0.9770833551883698, + "rewards/tag_count_reward": 0.7479166746139526, + "step": 2292 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.7479431152344, + "epoch": 0.733877420387262, + "grad_norm": 0.3340560793876648, + "kl": 0.32305283546447755, + "learning_rate": 4.0114943000905645e-06, + "loss": 0.1183, + "reward": 1.7885416984558105, + "reward_std": 0.19187554568052292, + "rewards/accuracy_reward": 0.08541667014360428, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2293 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.970849609375, + "epoch": 0.7341974715954552, + "grad_norm": 0.15596982836723328, + "kl": 0.5113927971571683, + "learning_rate": 4.00254756750727e-06, + "loss": 0.0962, + "reward": 1.7895833849906921, + "reward_std": 0.206942018866539, + "rewards/accuracy_reward": 0.09375000502914191, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2294 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.6208465576171, + "epoch": 0.7345175228036486, + "grad_norm": 0.104178786277771, + "kl": 0.285752671957016, + "learning_rate": 3.993608326010633e-06, + "loss": 0.1134, + "reward": 1.718750035762787, + "reward_std": 0.24399047940969468, + "rewards/accuracy_reward": 0.03125000111758709, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2295 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.2770935058594, + "epoch": 0.7348375740118419, + "grad_norm": 0.31328830122947693, + "kl": 0.3743242934346199, + "learning_rate": 3.984676586766167e-06, + "loss": 0.0678, + "reward": 1.793750035762787, + "reward_std": 0.18041975498199464, + "rewards/accuracy_reward": 0.08750000204890966, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2296 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.2833435058594, + "epoch": 0.7351576252200352, + "grad_norm": 0.09008507430553436, + "kl": 0.17614571936428547, + "learning_rate": 3.975752360930015e-06, + "loss": 0.0706, + "reward": 1.916145884990692, + "reward_std": 0.18779027387499808, + "rewards/accuracy_reward": 0.21041667070239783, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7411458432674408, + "step": 2297 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.4875122070313, + "epoch": 0.7354776764282285, + "grad_norm": 0.17964133620262146, + "kl": 0.31557943001389505, + "learning_rate": 3.9668356596489345e-06, + "loss": 0.087, + "reward": 1.7958333492279053, + "reward_std": 0.22537537813186645, + "rewards/accuracy_reward": 0.0979166692122817, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2298 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.4083404541016, + "epoch": 0.7357977276364218, + "grad_norm": 0.23957663774490356, + "kl": 0.15366022884845734, + "learning_rate": 3.957926494060285e-06, + "loss": 0.0417, + "reward": 1.8901042103767396, + "reward_std": 0.1613088697195053, + "rewards/accuracy_reward": 0.17083333991467953, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2299 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.9021087646485, + "epoch": 0.7361177788446152, + "grad_norm": 0.14986614882946014, + "kl": 0.2438764087855816, + "learning_rate": 3.9490248752920116e-06, + "loss": 0.0859, + "reward": 1.7635417222976684, + "reward_std": 0.19552550762891768, + "rewards/accuracy_reward": 0.05833333488553762, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2300 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.9354370117187, + "epoch": 0.7364378300528085, + "grad_norm": 0.07502276450395584, + "kl": 0.18566813617944716, + "learning_rate": 3.9401308144626375e-06, + "loss": 0.038, + "reward": 1.8125000476837159, + "reward_std": 0.23270709663629532, + "rewards/accuracy_reward": 0.09583333693444729, + "rewards/format_reward": 0.975000011920929, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2301 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.0937744140625, + "epoch": 0.7367578812610017, + "grad_norm": 0.11627375334501266, + "kl": 0.22418252676725386, + "learning_rate": 3.931244322681243e-06, + "loss": 0.0675, + "reward": 1.825520896911621, + "reward_std": 0.1442221499979496, + "rewards/accuracy_reward": 0.10416666883975267, + "rewards/format_reward": 0.9770833432674408, + "rewards/tag_count_reward": 0.7442708373069763, + "step": 2302 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.622933959961, + "epoch": 0.7370779324691951, + "grad_norm": 0.09706476330757141, + "kl": 0.20932482741773129, + "learning_rate": 3.922365411047451e-06, + "loss": 0.0396, + "reward": 1.8552083611488341, + "reward_std": 0.12424775958061218, + "rewards/accuracy_reward": 0.12916667032986878, + "rewards/format_reward": 0.9833333492279053, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2303 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.7833557128906, + "epoch": 0.7373979836773884, + "grad_norm": 0.1505707949399948, + "kl": 0.15522883646190166, + "learning_rate": 3.913494090651421e-06, + "loss": 0.0709, + "reward": 1.7562500357627868, + "reward_std": 0.1329023189842701, + "rewards/accuracy_reward": 0.031250000558793546, + "rewards/format_reward": 0.9770833432674408, + "rewards/tag_count_reward": 0.7479166865348816, + "step": 2304 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.4521057128907, + "epoch": 0.7377180348855817, + "grad_norm": 0.1870940774679184, + "kl": 0.18360717520117759, + "learning_rate": 3.90463037257383e-06, + "loss": 0.0767, + "reward": 1.864583396911621, + "reward_std": 0.19524949863553048, + "rewards/accuracy_reward": 0.15208333563059567, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2305 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.8229309082031, + "epoch": 0.738038086093775, + "grad_norm": 0.10056561976671219, + "kl": 0.25383531153202055, + "learning_rate": 3.8957742678858575e-06, + "loss": 0.0886, + "reward": 1.696875023841858, + "reward_std": 0.1846102386713028, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7364583611488342, + "step": 2306 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.4895935058594, + "epoch": 0.7383581373019683, + "grad_norm": 0.29490870237350464, + "kl": 0.4113804802298546, + "learning_rate": 3.8869257876491775e-06, + "loss": 0.1208, + "reward": 1.8765625715255738, + "reward_std": 0.17176087722182273, + "rewards/accuracy_reward": 0.17291667275130748, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2307 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.1604309082031, + "epoch": 0.7386781885101616, + "grad_norm": 0.34679147601127625, + "kl": 0.324555953592062, + "learning_rate": 3.8780849429159365e-06, + "loss": 0.0636, + "reward": 1.773437535762787, + "reward_std": 0.14504209160804749, + "rewards/accuracy_reward": 0.060416667722165585, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 2308 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.7812683105469, + "epoch": 0.738998239718355, + "grad_norm": 0.21677498519420624, + "kl": 0.22121107652783395, + "learning_rate": 3.869251744728745e-06, + "loss": 0.0584, + "reward": 1.8776042103767394, + "reward_std": 0.12323407009243965, + "rewards/accuracy_reward": 0.15833333730697632, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7442708492279053, + "step": 2309 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.7521026611328, + "epoch": 0.7393182909265482, + "grad_norm": 0.1344781219959259, + "kl": 0.2541138086467981, + "learning_rate": 3.8604262041206676e-06, + "loss": 0.0726, + "reward": 1.748958373069763, + "reward_std": 0.17084471583366395, + "rewards/accuracy_reward": 0.04791666865348816, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2310 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.4521118164063, + "epoch": 0.7396383421347416, + "grad_norm": 0.22849154472351074, + "kl": 0.26187874004244804, + "learning_rate": 3.851608332115192e-06, + "loss": 0.055, + "reward": 1.7343750238418578, + "reward_std": 0.14665495604276657, + "rewards/accuracy_reward": 0.022916667722165586, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7447916805744171, + "step": 2311 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.893765258789, + "epoch": 0.7399583933429349, + "grad_norm": 0.13005004823207855, + "kl": 0.2791564010083675, + "learning_rate": 3.842798139726239e-06, + "loss": 0.0738, + "reward": 1.7609375238418579, + "reward_std": 0.18678562864661216, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2312 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.3458557128906, + "epoch": 0.7402784445511282, + "grad_norm": 0.22451624274253845, + "kl": 0.35189689993858336, + "learning_rate": 3.833995637958134e-06, + "loss": 0.118, + "reward": 1.6958333492279052, + "reward_std": 0.18869037181138992, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7416666924953461, + "step": 2313 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.9021057128906, + "epoch": 0.7405984957593215, + "grad_norm": 0.24994489550590515, + "kl": 0.46310959905385973, + "learning_rate": 3.825200837805595e-06, + "loss": 0.1139, + "reward": 1.7739583849906921, + "reward_std": 0.21757967174053192, + "rewards/accuracy_reward": 0.08125000316649675, + "rewards/format_reward": 0.9562500298023224, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2314 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.2229370117187, + "epoch": 0.7409185469675148, + "grad_norm": 0.16934210062026978, + "kl": 0.5003368586301804, + "learning_rate": 3.8164137502537225e-06, + "loss": 0.1277, + "reward": 1.7718750476837157, + "reward_std": 0.23287726268172265, + "rewards/accuracy_reward": 0.08541667070239782, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 2315 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.0041870117187, + "epoch": 0.7412385981757081, + "grad_norm": 0.22636856138706207, + "kl": 0.46233353689312934, + "learning_rate": 3.8076343862779795e-06, + "loss": 0.1022, + "reward": 1.8239583849906922, + "reward_std": 0.18512208759784698, + "rewards/accuracy_reward": 0.1250000035390258, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7385416924953461, + "step": 2316 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.0791900634765, + "epoch": 0.7415586493839014, + "grad_norm": 0.2716544270515442, + "kl": 0.45498904660344125, + "learning_rate": 3.7988627568441884e-06, + "loss": 0.0936, + "reward": 1.8187500357627868, + "reward_std": 0.16201677322387695, + "rewards/accuracy_reward": 0.11041667070239783, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2317 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.6312683105468, + "epoch": 0.7418787005920947, + "grad_norm": 0.2425927370786667, + "kl": 0.4942165374755859, + "learning_rate": 3.7900988729085077e-06, + "loss": 0.1107, + "reward": 1.7463541865348815, + "reward_std": 0.2642780289053917, + "rewards/accuracy_reward": 0.06250000149011611, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7317708551883697, + "step": 2318 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.4625183105469, + "epoch": 0.742198751800288, + "grad_norm": 0.6389156579971313, + "kl": 0.6444176331162452, + "learning_rate": 3.7813427454174158e-06, + "loss": 0.1153, + "reward": 1.7890625238418578, + "reward_std": 0.25294766649603845, + "rewards/accuracy_reward": 0.1020833369344473, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7328125298023224, + "step": 2319 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.5083557128906, + "epoch": 0.7425188030084814, + "grad_norm": 0.17046241462230682, + "kl": 0.42304186820983886, + "learning_rate": 3.7725943853077105e-06, + "loss": 0.0969, + "reward": 1.7640625357627868, + "reward_std": 0.21301912367343903, + "rewards/accuracy_reward": 0.06458333432674408, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2320 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.8041809082031, + "epoch": 0.7428388542166746, + "grad_norm": 0.14227454364299774, + "kl": 0.5013582430779934, + "learning_rate": 3.7638538035064854e-06, + "loss": 0.1181, + "reward": 1.7229166984558106, + "reward_std": 0.20538915917277337, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.9395833492279053, + "rewards/tag_count_reward": 0.743750023841858, + "step": 2321 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.0437652587891, + "epoch": 0.743158905424868, + "grad_norm": 0.09683094918727875, + "kl": 0.28505592197179797, + "learning_rate": 3.7551210109311196e-06, + "loss": 0.0908, + "reward": 1.7859375476837158, + "reward_std": 0.20068425834178924, + "rewards/accuracy_reward": 0.08750000111758709, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2322 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.5416931152344, + "epoch": 0.7434789566330613, + "grad_norm": 0.13085141777992249, + "kl": 0.24737758412957192, + "learning_rate": 3.746396018489261e-06, + "loss": 0.0934, + "reward": 1.8333333730697632, + "reward_std": 0.21514089405536652, + "rewards/accuracy_reward": 0.13125000316649676, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 2323 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.2125122070313, + "epoch": 0.7437990078412546, + "grad_norm": 0.11616872996091843, + "kl": 0.39013450406491756, + "learning_rate": 3.7376788370788164e-06, + "loss": 0.0622, + "reward": 1.759895884990692, + "reward_std": 0.15159039273858071, + "rewards/accuracy_reward": 0.05625000204890966, + "rewards/format_reward": 0.9645833432674408, + "rewards/tag_count_reward": 0.7390625, + "step": 2324 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.8666809082031, + "epoch": 0.7441190590494479, + "grad_norm": 0.12471839785575867, + "kl": 0.1726240862160921, + "learning_rate": 3.728969477587935e-06, + "loss": 0.0673, + "reward": 1.8135417103767395, + "reward_std": 0.16715479716658593, + "rewards/accuracy_reward": 0.09791666939854622, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7427083432674408, + "step": 2325 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.9854431152344, + "epoch": 0.7444391102576412, + "grad_norm": 0.10027755051851273, + "kl": 0.16627274565398692, + "learning_rate": 3.7202679508950015e-06, + "loss": 0.0805, + "reward": 1.7869792103767395, + "reward_std": 0.1703805223107338, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2326 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.2875244140625, + "epoch": 0.7447591614658345, + "grad_norm": 0.08407598733901978, + "kl": 0.1928509298712015, + "learning_rate": 3.7115742678686053e-06, + "loss": 0.1048, + "reward": 1.7682291865348816, + "reward_std": 0.2047549694776535, + "rewards/accuracy_reward": 0.08541667014360428, + "rewards/format_reward": 0.9458333432674408, + "rewards/tag_count_reward": 0.7369791805744171, + "step": 2327 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.6541778564454, + "epoch": 0.7450792126740279, + "grad_norm": 0.22395098209381104, + "kl": 0.18675260804593563, + "learning_rate": 3.7028884393675478e-06, + "loss": 0.0446, + "reward": 1.847395896911621, + "reward_std": 0.16364309713244438, + "rewards/accuracy_reward": 0.12916667126119136, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2328 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.3396118164062, + "epoch": 0.7453992638822211, + "grad_norm": 0.20289666950702667, + "kl": 0.16297319643199443, + "learning_rate": 3.6942104762408183e-06, + "loss": 0.0611, + "reward": 1.778125035762787, + "reward_std": 0.17890555337071418, + "rewards/accuracy_reward": 0.07500000204890966, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2329 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.1729370117188, + "epoch": 0.7457193150904144, + "grad_norm": 0.07707800716161728, + "kl": 0.21096321307122706, + "learning_rate": 3.685540389327583e-06, + "loss": 0.0795, + "reward": 1.756770873069763, + "reward_std": 0.14885179400444032, + "rewards/accuracy_reward": 0.045833334885537626, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7421875119209289, + "step": 2330 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.7291839599609, + "epoch": 0.7460393662986078, + "grad_norm": 0.0692569687962532, + "kl": 0.13143852166831493, + "learning_rate": 3.676878189457167e-06, + "loss": 0.0346, + "reward": 1.8354166865348815, + "reward_std": 0.15971220657229424, + "rewards/accuracy_reward": 0.11666666772216558, + "rewards/format_reward": 0.9750000059604644, + "rewards/tag_count_reward": 0.7437500119209289, + "step": 2331 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.533349609375, + "epoch": 0.7463594175068011, + "grad_norm": 0.08359574526548386, + "kl": 0.13114451617002487, + "learning_rate": 3.6682238874490463e-06, + "loss": 0.0544, + "reward": 1.8244791984558106, + "reward_std": 0.15698686689138414, + "rewards/accuracy_reward": 0.1020833369344473, + "rewards/format_reward": 0.9791666865348816, + "rewards/tag_count_reward": 0.743229192495346, + "step": 2332 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.0458435058594, + "epoch": 0.7466794687149944, + "grad_norm": 0.14677995443344116, + "kl": 0.20154299661517144, + "learning_rate": 3.6595774941128315e-06, + "loss": 0.057, + "reward": 1.7786458492279054, + "reward_std": 0.15753973126411439, + "rewards/accuracy_reward": 0.05416666697710752, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.7473958551883697, + "step": 2333 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.8000244140625, + "epoch": 0.7469995199231877, + "grad_norm": 0.27684590220451355, + "kl": 0.329647683724761, + "learning_rate": 3.6509390202482553e-06, + "loss": 0.0634, + "reward": 1.7859375476837158, + "reward_std": 0.2338223472237587, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7380208432674408, + "step": 2334 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.5416870117188, + "epoch": 0.747319571131381, + "grad_norm": 0.11186019331216812, + "kl": 0.2218513660132885, + "learning_rate": 3.6423084766451622e-06, + "loss": 0.0894, + "reward": 1.780208373069763, + "reward_std": 0.22022225856781005, + "rewards/accuracy_reward": 0.07916666772216559, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2335 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.0458465576172, + "epoch": 0.7476396223395744, + "grad_norm": 0.13403449952602386, + "kl": 0.15975108332931995, + "learning_rate": 3.63368587408348e-06, + "loss": 0.0551, + "reward": 1.8479166984558106, + "reward_std": 0.1556932583451271, + "rewards/accuracy_reward": 0.12916666977107524, + "rewards/format_reward": 0.9770833551883698, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2336 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.2354400634765, + "epoch": 0.7479596735477676, + "grad_norm": 0.08647426962852478, + "kl": 0.14235255531966687, + "learning_rate": 3.6250712233332297e-06, + "loss": 0.055, + "reward": 1.7786458730697632, + "reward_std": 0.205631835013628, + "rewards/accuracy_reward": 0.06041666716337204, + "rewards/format_reward": 0.9729166924953461, + "rewards/tag_count_reward": 0.7453125178813934, + "step": 2337 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.6270935058594, + "epoch": 0.7482797247559609, + "grad_norm": 0.11648111790418625, + "kl": 0.24137634374201297, + "learning_rate": 3.6164645351544956e-06, + "loss": 0.0716, + "reward": 1.750000035762787, + "reward_std": 0.20163048431277275, + "rewards/accuracy_reward": 0.043750002048909664, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2338 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.1875183105469, + "epoch": 0.7485997759641543, + "grad_norm": 0.1161661297082901, + "kl": 0.1347400803118944, + "learning_rate": 3.607865820297416e-06, + "loss": 0.0517, + "reward": 1.7734375238418578, + "reward_std": 0.15068410485982894, + "rewards/accuracy_reward": 0.05208333563059568, + "rewards/format_reward": 0.9770833551883698, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 2339 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.9354370117187, + "epoch": 0.7489198271723476, + "grad_norm": 0.18194898962974548, + "kl": 0.20232655815780162, + "learning_rate": 3.5992750895021713e-06, + "loss": 0.0326, + "reward": 1.7625000119209289, + "reward_std": 0.11509302705526352, + "rewards/accuracy_reward": 0.029166666977107524, + "rewards/format_reward": 0.987500011920929, + "rewards/tag_count_reward": 0.7458333373069763, + "step": 2340 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.2583557128906, + "epoch": 0.7492398783805408, + "grad_norm": 0.1009315699338913, + "kl": 0.23906343355774878, + "learning_rate": 3.590692353498968e-06, + "loss": 0.0879, + "reward": 1.7583333611488343, + "reward_std": 0.16905978918075562, + "rewards/accuracy_reward": 0.04791666902601719, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.743750023841858, + "step": 2341 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.5229278564453, + "epoch": 0.7495599295887342, + "grad_norm": 0.13877518475055695, + "kl": 0.14381254725158216, + "learning_rate": 3.58211762300803e-06, + "loss": 0.0756, + "reward": 1.7395833730697632, + "reward_std": 0.1424667552113533, + "rewards/accuracy_reward": 0.025000000186264516, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7458333551883698, + "step": 2342 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.8833526611328, + "epoch": 0.7498799807969275, + "grad_norm": 0.1535481959581375, + "kl": 0.211732941493392, + "learning_rate": 3.5735509087395815e-06, + "loss": 0.0572, + "reward": 1.7979166984558106, + "reward_std": 0.16340636983513832, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2343 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.5916900634766, + "epoch": 0.7502000320051209, + "grad_norm": 0.21330109238624573, + "kl": 0.37927271127700807, + "learning_rate": 3.564992221393825e-06, + "loss": 0.0941, + "reward": 1.7723958611488342, + "reward_std": 0.21374720484018325, + "rewards/accuracy_reward": 0.07291666902601719, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2344 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.945849609375, + "epoch": 0.7505200832133141, + "grad_norm": 0.23470190167427063, + "kl": 0.1548861227929592, + "learning_rate": 3.556441571660948e-06, + "loss": 0.0559, + "reward": 1.9057292222976685, + "reward_std": 0.1434769354760647, + "rewards/accuracy_reward": 0.18333334047347308, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7473958492279053, + "step": 2345 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.0937744140625, + "epoch": 0.7508401344215074, + "grad_norm": 0.09681239724159241, + "kl": 0.17304839566349983, + "learning_rate": 3.5478989702210966e-06, + "loss": 0.0841, + "reward": 1.7692708730697633, + "reward_std": 0.13986653685569764, + "rewards/accuracy_reward": 0.04791666883975267, + "rewards/format_reward": 0.9750000238418579, + "rewards/tag_count_reward": 0.7463541865348816, + "step": 2346 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.3562713623047, + "epoch": 0.7511601856297008, + "grad_norm": 0.11565534770488739, + "kl": 0.1687497179955244, + "learning_rate": 3.5393644277443596e-06, + "loss": 0.0706, + "reward": 1.7510416865348817, + "reward_std": 0.12845450565218924, + "rewards/accuracy_reward": 0.025000000186264516, + "rewards/format_reward": 0.9791666865348816, + "rewards/tag_count_reward": 0.7468750178813934, + "step": 2347 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.5395935058593, + "epoch": 0.7514802368378941, + "grad_norm": 0.13690054416656494, + "kl": 0.25157116502523424, + "learning_rate": 3.5308379548907644e-06, + "loss": 0.0988, + "reward": 1.7223958492279052, + "reward_std": 0.23458851501345634, + "rewards/accuracy_reward": 0.027083334513008595, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2348 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.202099609375, + "epoch": 0.7518002880460873, + "grad_norm": 0.3587300777435303, + "kl": 0.21654266826808452, + "learning_rate": 3.522319562310259e-06, + "loss": 0.0701, + "reward": 1.7614583611488341, + "reward_std": 0.1720878452062607, + "rewards/accuracy_reward": 0.0583333358168602, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7427083432674408, + "step": 2349 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.8146026611328, + "epoch": 0.7521203392542807, + "grad_norm": 0.11443227529525757, + "kl": 0.1734992451965809, + "learning_rate": 3.513809260642694e-06, + "loss": 0.045, + "reward": 1.9213542103767396, + "reward_std": 0.1500195875763893, + "rewards/accuracy_reward": 0.19375000409781934, + "rewards/format_reward": 0.9833333492279053, + "rewards/tag_count_reward": 0.7442708432674408, + "step": 2350 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.5625244140625, + "epoch": 0.752440390462474, + "grad_norm": 0.20784099400043488, + "kl": 0.27498736456036565, + "learning_rate": 3.505307060517823e-06, + "loss": 0.0675, + "reward": 1.8005208611488341, + "reward_std": 0.21704452484846115, + "rewards/accuracy_reward": 0.0958333371207118, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2351 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.2479461669922, + "epoch": 0.7527604416706674, + "grad_norm": 0.3816712498664856, + "kl": 0.2782451644539833, + "learning_rate": 3.496812972555266e-06, + "loss": 0.0863, + "reward": 1.7848958730697633, + "reward_std": 0.13636522069573404, + "rewards/accuracy_reward": 0.07083333637565374, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2352 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.8937713623047, + "epoch": 0.7530804928788606, + "grad_norm": 0.20585204660892487, + "kl": 0.2115282118320465, + "learning_rate": 3.488327007364525e-06, + "loss": 0.0555, + "reward": 1.7791667103767395, + "reward_std": 0.13307306319475173, + "rewards/accuracy_reward": 0.05833333544433117, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7479166805744171, + "step": 2353 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.2354370117188, + "epoch": 0.7534005440870539, + "grad_norm": 0.11561473459005356, + "kl": 0.3394802324473858, + "learning_rate": 3.4798491755449483e-06, + "loss": 0.1463, + "reward": 1.7208333611488342, + "reward_std": 0.24762727022171022, + "rewards/accuracy_reward": 0.02916666753590107, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2354 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.4229339599609, + "epoch": 0.7537205952952473, + "grad_norm": 0.19716666638851166, + "kl": 0.4750838838517666, + "learning_rate": 3.471379487685729e-06, + "loss": 0.1206, + "reward": 1.7885417103767396, + "reward_std": 0.2462085708975792, + "rewards/accuracy_reward": 0.09583333600312471, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7406250298023224, + "step": 2355 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.5479400634765, + "epoch": 0.7540406465034406, + "grad_norm": 0.1148693636059761, + "kl": 0.2551156237721443, + "learning_rate": 3.4629179543658852e-06, + "loss": 0.0645, + "reward": 1.8052083730697632, + "reward_std": 0.16417960971593856, + "rewards/accuracy_reward": 0.09791667070239782, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7406250298023224, + "step": 2356 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.2541870117187, + "epoch": 0.7543606977116338, + "grad_norm": 0.1864629089832306, + "kl": 0.3117987260222435, + "learning_rate": 3.4544645861542525e-06, + "loss": 0.0914, + "reward": 1.7963542103767396, + "reward_std": 0.249758792668581, + "rewards/accuracy_reward": 0.09375000316649676, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2357 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.941683959961, + "epoch": 0.7546807489198272, + "grad_norm": 0.18410681188106537, + "kl": 0.19241276159882545, + "learning_rate": 3.4460193936094644e-06, + "loss": 0.0385, + "reward": 1.8593750476837159, + "reward_std": 0.17560729682445525, + "rewards/accuracy_reward": 0.14166666977107525, + "rewards/format_reward": 0.9708333611488342, + "rewards/tag_count_reward": 0.7468750178813934, + "step": 2358 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.714599609375, + "epoch": 0.7550008001280205, + "grad_norm": 0.16778114438056946, + "kl": 0.36679080240428447, + "learning_rate": 3.437582387279946e-06, + "loss": 0.1082, + "reward": 1.8822917103767396, + "reward_std": 0.20298010110855103, + "rewards/accuracy_reward": 0.19375000800937414, + "rewards/format_reward": 0.950000011920929, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2359 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.6500213623046, + "epoch": 0.7553208513362137, + "grad_norm": 0.2953540086746216, + "kl": 0.3251816764473915, + "learning_rate": 3.4291535777039e-06, + "loss": 0.1073, + "reward": 1.7552083730697632, + "reward_std": 0.21472963988780974, + "rewards/accuracy_reward": 0.058333334513008595, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2360 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.9479370117188, + "epoch": 0.7556409025444071, + "grad_norm": 0.2134820818901062, + "kl": 0.388304453343153, + "learning_rate": 3.4207329754092787e-06, + "loss": 0.1032, + "reward": 1.807812547683716, + "reward_std": 0.2219138652086258, + "rewards/accuracy_reward": 0.1104166692122817, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2361 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.189599609375, + "epoch": 0.7559609537526004, + "grad_norm": 0.14765672385692596, + "kl": 0.37944440804421903, + "learning_rate": 3.412320590913796e-06, + "loss": 0.0673, + "reward": 1.783333384990692, + "reward_std": 0.17042958214879037, + "rewards/accuracy_reward": 0.07500000130385161, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7437500059604645, + "step": 2362 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.214599609375, + "epoch": 0.7562810049607938, + "grad_norm": 0.18267126381397247, + "kl": 0.33554785549640653, + "learning_rate": 3.4039164347248953e-06, + "loss": 0.0857, + "reward": 1.7166666746139527, + "reward_std": 0.18649079501628876, + "rewards/accuracy_reward": 0.012500000558793545, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2363 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.3229370117188, + "epoch": 0.756601056168987, + "grad_norm": 0.18736283481121063, + "kl": 0.3915620282292366, + "learning_rate": 3.3955205173397463e-06, + "loss": 0.0988, + "reward": 1.7807292222976685, + "reward_std": 0.22891742140054702, + "rewards/accuracy_reward": 0.09166666958481073, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.743229192495346, + "step": 2364 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.0021057128906, + "epoch": 0.7569211073771803, + "grad_norm": 0.2990589737892151, + "kl": 0.3989905290305614, + "learning_rate": 3.387132849245224e-06, + "loss": 0.1328, + "reward": 1.7213541746139527, + "reward_std": 0.2603048712015152, + "rewards/accuracy_reward": 0.0354166679084301, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2365 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.6687622070312, + "epoch": 0.7572411585853737, + "grad_norm": 0.1770792007446289, + "kl": 0.2988043397665024, + "learning_rate": 3.378753440917901e-06, + "loss": 0.0809, + "reward": 1.7817708492279052, + "reward_std": 0.1883620299398899, + "rewards/accuracy_reward": 0.07708333432674408, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 2366 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.7437683105469, + "epoch": 0.757561209793567, + "grad_norm": 0.24669428169727325, + "kl": 0.22290822267532348, + "learning_rate": 3.3703823028240355e-06, + "loss": 0.0688, + "reward": 1.763020884990692, + "reward_std": 0.18148678839206694, + "rewards/accuracy_reward": 0.05208333544433117, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7442708492279053, + "step": 2367 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.8812683105468, + "epoch": 0.7578812610017602, + "grad_norm": 0.2096192091703415, + "kl": 0.24256822615861892, + "learning_rate": 3.3620194454195565e-06, + "loss": 0.0841, + "reward": 1.7984375476837158, + "reward_std": 0.23278064355254174, + "rewards/accuracy_reward": 0.10000000204890966, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2368 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.6125244140625, + "epoch": 0.7582013122099536, + "grad_norm": 0.2237611711025238, + "kl": 0.38547887057065966, + "learning_rate": 3.353664879150039e-06, + "loss": 0.1108, + "reward": 1.745312547683716, + "reward_std": 0.22282838672399521, + "rewards/accuracy_reward": 0.054166668094694614, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 2369 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.5729309082031, + "epoch": 0.7585213634181469, + "grad_norm": 0.3807878792285919, + "kl": 0.3787414848804474, + "learning_rate": 3.3453186144507168e-06, + "loss": 0.0842, + "reward": 1.8406250596046447, + "reward_std": 0.16929481625556947, + "rewards/accuracy_reward": 0.1333333384245634, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 2370 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.7750091552734, + "epoch": 0.7588414146263402, + "grad_norm": 0.19828210771083832, + "kl": 0.4050968214869499, + "learning_rate": 3.336980661746446e-06, + "loss": 0.095, + "reward": 1.802083384990692, + "reward_std": 0.249814622849226, + "rewards/accuracy_reward": 0.11041667070239783, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2371 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.7395935058594, + "epoch": 0.7591614658345335, + "grad_norm": 0.18706656992435455, + "kl": 0.3668996267020702, + "learning_rate": 3.3286510314517027e-06, + "loss": 0.0955, + "reward": 1.6937500476837157, + "reward_std": 0.21575065851211547, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2372 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.1312713623047, + "epoch": 0.7594815170427268, + "grad_norm": 0.18040499091148376, + "kl": 0.3536977834999561, + "learning_rate": 3.3203297339705697e-06, + "loss": 0.1014, + "reward": 1.8427083849906922, + "reward_std": 0.20292569175362588, + "rewards/accuracy_reward": 0.14791667070239783, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2373 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.9500183105469, + "epoch": 0.7598015682509202, + "grad_norm": 0.17289505898952484, + "kl": 0.4077607229351997, + "learning_rate": 3.3120167796967195e-06, + "loss": 0.1037, + "reward": 1.7218750238418579, + "reward_std": 0.1947345994412899, + "rewards/accuracy_reward": 0.018750000558793545, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2374 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.6645965576172, + "epoch": 0.7601216194591135, + "grad_norm": 0.4230034053325653, + "kl": 0.45233857035636904, + "learning_rate": 3.303712179013404e-06, + "loss": 0.0924, + "reward": 1.7479166984558105, + "reward_std": 0.2048986002802849, + "rewards/accuracy_reward": 0.05000000223517418, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2375 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.5125152587891, + "epoch": 0.7604416706673067, + "grad_norm": 0.23644131422042847, + "kl": 0.47083998322486875, + "learning_rate": 3.295415942293445e-06, + "loss": 0.1508, + "reward": 1.7208333611488342, + "reward_std": 0.30071154832839964, + "rewards/accuracy_reward": 0.07083333563059568, + "rewards/format_reward": 0.922916692495346, + "rewards/tag_count_reward": 0.7270833551883698, + "step": 2376 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.4854370117188, + "epoch": 0.7607617218755001, + "grad_norm": 0.25301748514175415, + "kl": 0.3236188516020775, + "learning_rate": 3.2871280798992065e-06, + "loss": 0.0775, + "reward": 1.8088542103767395, + "reward_std": 0.20637039393186568, + "rewards/accuracy_reward": 0.10833333600312471, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2377 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.7437683105469, + "epoch": 0.7610817730836934, + "grad_norm": 0.2608127295970917, + "kl": 0.34846524000167844, + "learning_rate": 3.278848602182604e-06, + "loss": 0.117, + "reward": 1.7947917222976684, + "reward_std": 0.23584940135478974, + "rewards/accuracy_reward": 0.11875000409781933, + "rewards/format_reward": 0.9416666924953461, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2378 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.7937713623047, + "epoch": 0.7614018242918867, + "grad_norm": 0.13144290447235107, + "kl": 0.27924820110201837, + "learning_rate": 3.2705775194850754e-06, + "loss": 0.0652, + "reward": 1.8026042103767395, + "reward_std": 0.22813262045383453, + "rewards/accuracy_reward": 0.10000000204890966, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2379 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.1583526611328, + "epoch": 0.76172187550008, + "grad_norm": 0.17167526483535767, + "kl": 0.49086830765008926, + "learning_rate": 3.262314842137573e-06, + "loss": 0.1266, + "reward": 1.6869792103767396, + "reward_std": 0.2478036791086197, + "rewards/accuracy_reward": 0.04791666865348816, + "rewards/format_reward": 0.9187500238418579, + "rewards/tag_count_reward": 0.720312523841858, + "step": 2380 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.5666931152343, + "epoch": 0.7620419267082733, + "grad_norm": 0.3797181248664856, + "kl": 0.4156690865755081, + "learning_rate": 3.2540605804605518e-06, + "loss": 0.0797, + "reward": 1.7578125476837159, + "reward_std": 0.2350001037120819, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2381 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.8062744140625, + "epoch": 0.7623619779164666, + "grad_norm": 0.16799494624137878, + "kl": 0.29723322987556455, + "learning_rate": 3.245814744763953e-06, + "loss": 0.0945, + "reward": 1.7552083730697632, + "reward_std": 0.22091315314173698, + "rewards/accuracy_reward": 0.05833333544433117, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2382 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.0979431152343, + "epoch": 0.76268202912466, + "grad_norm": 0.5904622673988342, + "kl": 0.3778699226677418, + "learning_rate": 3.237577345347196e-06, + "loss": 0.1171, + "reward": 1.7447916865348816, + "reward_std": 0.24800491482019424, + "rewards/accuracy_reward": 0.06041666865348816, + "rewards/format_reward": 0.9458333611488342, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2383 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.0146118164063, + "epoch": 0.7630020803328532, + "grad_norm": 0.3403340280056, + "kl": 0.29922411739826205, + "learning_rate": 3.2293483924991632e-06, + "loss": 0.0793, + "reward": 1.754687535762787, + "reward_std": 0.23587894216179847, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2384 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.1354248046875, + "epoch": 0.7633221315410466, + "grad_norm": 0.13508936762809753, + "kl": 0.3111469350755215, + "learning_rate": 3.2211278964981794e-06, + "loss": 0.0799, + "reward": 1.86770840883255, + "reward_std": 0.24278006702661514, + "rewards/accuracy_reward": 0.17500000540167093, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2385 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.5812744140625, + "epoch": 0.7636421827492399, + "grad_norm": 0.2847776412963867, + "kl": 0.2592492446303368, + "learning_rate": 3.2129158676120176e-06, + "loss": 0.092, + "reward": 1.7619791865348815, + "reward_std": 0.1997735843062401, + "rewards/accuracy_reward": 0.06250000204890967, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2386 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.4541870117188, + "epoch": 0.7639622339574332, + "grad_norm": 0.15784919261932373, + "kl": 0.3668780118227005, + "learning_rate": 3.2047123160978655e-06, + "loss": 0.1224, + "reward": 1.734375011920929, + "reward_std": 0.25276233106851576, + "rewards/accuracy_reward": 0.05000000260770321, + "rewards/format_reward": 0.9458333611488342, + "rewards/tag_count_reward": 0.7385416924953461, + "step": 2387 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.952099609375, + "epoch": 0.7642822851656265, + "grad_norm": 0.1636374145746231, + "kl": 0.45332918018102647, + "learning_rate": 3.19651725220233e-06, + "loss": 0.1207, + "reward": 1.7656250476837159, + "reward_std": 0.2817038677632809, + "rewards/accuracy_reward": 0.0958333371207118, + "rewards/format_reward": 0.9354166865348816, + "rewards/tag_count_reward": 0.7343750119209289, + "step": 2388 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.2979370117188, + "epoch": 0.7646023363738198, + "grad_norm": 0.28287118673324585, + "kl": 0.43390736877918246, + "learning_rate": 3.1883306861614104e-06, + "loss": 0.1094, + "reward": 1.7614583849906922, + "reward_std": 0.24015092253684997, + "rewards/accuracy_reward": 0.0687500013038516, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2389 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.972933959961, + "epoch": 0.7649223875820131, + "grad_norm": 0.5688726902008057, + "kl": 0.5063559889793396, + "learning_rate": 3.180152628200496e-06, + "loss": 0.1007, + "reward": 1.8213542342185973, + "reward_std": 0.29036264047026633, + "rewards/accuracy_reward": 0.14375000465661286, + "rewards/format_reward": 0.9458333432674408, + "rewards/tag_count_reward": 0.7317708611488343, + "step": 2390 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.9750213623047, + "epoch": 0.7652424387902065, + "grad_norm": 0.4694216251373291, + "kl": 0.37770887836813927, + "learning_rate": 3.171983088534346e-06, + "loss": 0.1105, + "reward": 1.7708333492279054, + "reward_std": 0.2839522875845432, + "rewards/accuracy_reward": 0.09791666977107524, + "rewards/format_reward": 0.9375000178813935, + "rewards/tag_count_reward": 0.735416692495346, + "step": 2391 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.8562561035156, + "epoch": 0.7655624899983997, + "grad_norm": 0.20927488803863525, + "kl": 0.5734367772936821, + "learning_rate": 3.1638220773670825e-06, + "loss": 0.1439, + "reward": 1.7171875476837157, + "reward_std": 0.2636822387576103, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.9395833432674408, + "rewards/tag_count_reward": 0.7317708432674408, + "step": 2392 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.2229309082031, + "epoch": 0.765882541206593, + "grad_norm": 0.3315555453300476, + "kl": 0.4113732993602753, + "learning_rate": 3.1556696048921764e-06, + "loss": 0.0684, + "reward": 1.7453125357627868, + "reward_std": 0.2074427381157875, + "rewards/accuracy_reward": 0.035416666977107526, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2393 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.8875274658203, + "epoch": 0.7662025924147864, + "grad_norm": 0.4421977400779724, + "kl": 0.5268315270543098, + "learning_rate": 3.147525681292425e-06, + "loss": 0.0601, + "reward": 1.7333333611488342, + "reward_std": 0.192195063829422, + "rewards/accuracy_reward": 0.07500000316649676, + "rewards/format_reward": 0.9375000119209289, + "rewards/tag_count_reward": 0.7208333432674408, + "step": 2394 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.9854339599609, + "epoch": 0.7665226436229797, + "grad_norm": 0.21242809295654297, + "kl": 0.3715293690562248, + "learning_rate": 3.1393903167399553e-06, + "loss": 0.0978, + "reward": 1.835937535762787, + "reward_std": 0.2570257142186165, + "rewards/accuracy_reward": 0.1541666707023978, + "rewards/format_reward": 0.9479166805744171, + "rewards/tag_count_reward": 0.7338541746139526, + "step": 2395 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.5520935058594, + "epoch": 0.766842694831173, + "grad_norm": 0.20334164798259735, + "kl": 0.5021124824881553, + "learning_rate": 3.131263521396204e-06, + "loss": 0.1268, + "reward": 1.728645884990692, + "reward_std": 0.22536734342575074, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.736979192495346, + "step": 2396 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.6146057128906, + "epoch": 0.7671627460393663, + "grad_norm": 0.22740711271762848, + "kl": 0.6462334305047989, + "learning_rate": 3.123145305411902e-06, + "loss": 0.1419, + "reward": 1.7661458611488343, + "reward_std": 0.270243152230978, + "rewards/accuracy_reward": 0.09166666883975268, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.7328125178813935, + "step": 2397 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.4958557128906, + "epoch": 0.7674827972475596, + "grad_norm": 0.11030031740665436, + "kl": 0.19400645047426224, + "learning_rate": 3.115035678927063e-06, + "loss": 0.0823, + "reward": 1.8479167222976685, + "reward_std": 0.24711870402097702, + "rewards/accuracy_reward": 0.14791666772216558, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2398 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.727099609375, + "epoch": 0.767802848455753, + "grad_norm": 0.31179970502853394, + "kl": 0.3838733732700348, + "learning_rate": 3.106934652070975e-06, + "loss": 0.1031, + "reward": 1.8020833611488343, + "reward_std": 0.1734007865190506, + "rewards/accuracy_reward": 0.09166666865348816, + "rewards/format_reward": 0.9708333432674408, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2399 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.8354339599609, + "epoch": 0.7681228996639462, + "grad_norm": 0.21114316582679749, + "kl": 0.2608781367540359, + "learning_rate": 3.098842234962183e-06, + "loss": 0.0605, + "reward": 1.8239583730697633, + "reward_std": 0.19166183918714524, + "rewards/accuracy_reward": 0.11666667088866234, + "rewards/format_reward": 0.9625000298023224, + "rewards/tag_count_reward": 0.744791692495346, + "step": 2400 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.9812652587891, + "epoch": 0.7684429508721395, + "grad_norm": 0.15161120891571045, + "kl": 0.43533697724342346, + "learning_rate": 3.090758437708482e-06, + "loss": 0.0964, + "reward": 1.715625023841858, + "reward_std": 0.18822606652975082, + "rewards/accuracy_reward": 0.018750000186264514, + "rewards/format_reward": 0.9583333432674408, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2401 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.7396118164063, + "epoch": 0.7687630020803329, + "grad_norm": 0.1593593806028366, + "kl": 0.32452878206968305, + "learning_rate": 3.08268327040689e-06, + "loss": 0.1009, + "reward": 1.7885417222976685, + "reward_std": 0.22790974006056786, + "rewards/accuracy_reward": 0.09583333600312471, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2402 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.2666809082032, + "epoch": 0.7690830532885262, + "grad_norm": 0.1515556126832962, + "kl": 0.3837066598236561, + "learning_rate": 3.0746167431436547e-06, + "loss": 0.083, + "reward": 1.7276041984558106, + "reward_std": 0.24149103313684464, + "rewards/accuracy_reward": 0.05000000204890966, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7317708492279053, + "step": 2403 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.2416839599609, + "epoch": 0.7694031044967194, + "grad_norm": 0.1699807196855545, + "kl": 0.23230726271867752, + "learning_rate": 3.0665588659942314e-06, + "loss": 0.1027, + "reward": 1.8213542103767395, + "reward_std": 0.22609889209270478, + "rewards/accuracy_reward": 0.12083333563059569, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2404 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.9437683105468, + "epoch": 0.7697231557049128, + "grad_norm": 0.12499313056468964, + "kl": 0.28099107556045055, + "learning_rate": 3.058509649023269e-06, + "loss": 0.1095, + "reward": 1.729687547683716, + "reward_std": 0.18809969127178192, + "rewards/accuracy_reward": 0.02500000111758709, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2405 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.3875183105469, + "epoch": 0.7700432069131061, + "grad_norm": 0.1266588419675827, + "kl": 0.16458383351564407, + "learning_rate": 3.050469102284601e-06, + "loss": 0.0639, + "reward": 1.7901042222976684, + "reward_std": 0.14033205881714822, + "rewards/accuracy_reward": 0.07500000204890966, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 2406 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.993783569336, + "epoch": 0.7703632581212994, + "grad_norm": 0.1295279860496521, + "kl": 0.37859131768345833, + "learning_rate": 3.0424372358212285e-06, + "loss": 0.0713, + "reward": 1.7776041865348815, + "reward_std": 0.22646936923265457, + "rewards/accuracy_reward": 0.0854166692122817, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2407 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.9500122070312, + "epoch": 0.7706833093294927, + "grad_norm": 0.18353892862796783, + "kl": 0.2817696675658226, + "learning_rate": 3.0344140596653126e-06, + "loss": 0.0866, + "reward": 1.7906250476837158, + "reward_std": 0.19676049128174783, + "rewards/accuracy_reward": 0.08333333488553762, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7385416746139526, + "step": 2408 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.4458557128906, + "epoch": 0.771003360537686, + "grad_norm": 0.13988955318927765, + "kl": 0.22601069658994674, + "learning_rate": 3.026399583838163e-06, + "loss": 0.0425, + "reward": 1.8536458969116212, + "reward_std": 0.1386600524187088, + "rewards/accuracy_reward": 0.1312500022351742, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.745312511920929, + "step": 2409 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.1979309082031, + "epoch": 0.7713234117458794, + "grad_norm": 0.15479102730751038, + "kl": 0.46245444044470785, + "learning_rate": 3.0183938183502147e-06, + "loss": 0.0515, + "reward": 1.8177083730697632, + "reward_std": 0.22176536172628403, + "rewards/accuracy_reward": 0.11458333861082792, + "rewards/format_reward": 0.9625000298023224, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2410 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.7250061035156, + "epoch": 0.7716434629540726, + "grad_norm": 0.15402555465698242, + "kl": 0.29928482323884964, + "learning_rate": 3.0103967732010277e-06, + "loss": 0.1084, + "reward": 1.7593750476837158, + "reward_std": 0.1930878482758999, + "rewards/accuracy_reward": 0.05833333507180214, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2411 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.2312744140625, + "epoch": 0.7719635141622659, + "grad_norm": 0.10418938845396042, + "kl": 0.3145505003631115, + "learning_rate": 3.0024084583792702e-06, + "loss": 0.0554, + "reward": 1.7895833730697632, + "reward_std": 0.19940297231078147, + "rewards/accuracy_reward": 0.0812500050291419, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7416666924953461, + "step": 2412 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.2437683105469, + "epoch": 0.7722835653704593, + "grad_norm": 0.1994236707687378, + "kl": 0.19253274872899057, + "learning_rate": 2.9944288838627055e-06, + "loss": 0.051, + "reward": 1.7958333492279053, + "reward_std": 0.15394851118326186, + "rewards/accuracy_reward": 0.07916666902601718, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7416666746139526, + "step": 2413 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.1271057128906, + "epoch": 0.7726036165786526, + "grad_norm": 0.17601707577705383, + "kl": 0.3171676769852638, + "learning_rate": 2.986458059618179e-06, + "loss": 0.0766, + "reward": 1.6947917103767396, + "reward_std": 0.240276700258255, + "rewards/accuracy_reward": 0.01875000074505806, + "rewards/format_reward": 0.9395833551883698, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2414 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.7958557128907, + "epoch": 0.7729236677868458, + "grad_norm": 0.09672868251800537, + "kl": 0.23676921837031842, + "learning_rate": 2.978495995601608e-06, + "loss": 0.0733, + "reward": 1.7895833611488343, + "reward_std": 0.15342155173420907, + "rewards/accuracy_reward": 0.07708333730697632, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2415 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.4625213623046, + "epoch": 0.7732437189950392, + "grad_norm": 0.10931958258152008, + "kl": 0.2647506821900606, + "learning_rate": 2.970542701757967e-06, + "loss": 0.0457, + "reward": 1.8171875476837158, + "reward_std": 0.15667049512267112, + "rewards/accuracy_reward": 0.09791667014360428, + "rewards/format_reward": 0.9770833551883698, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2416 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.3500152587891, + "epoch": 0.7735637702032325, + "grad_norm": 0.12150561064481735, + "kl": 0.19132925122976302, + "learning_rate": 2.962598188021275e-06, + "loss": 0.0552, + "reward": 1.7864583730697632, + "reward_std": 0.13132432252168655, + "rewards/accuracy_reward": 0.06041666977107525, + "rewards/format_reward": 0.9812500178813934, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 2417 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.8458557128906, + "epoch": 0.7738838214114259, + "grad_norm": 0.13992400467395782, + "kl": 0.14227314628660678, + "learning_rate": 2.9546624643145894e-06, + "loss": 0.0547, + "reward": 1.7958333611488342, + "reward_std": 0.11764216274023057, + "rewards/accuracy_reward": 0.07500000204890966, + "rewards/format_reward": 0.975000011920929, + "rewards/tag_count_reward": 0.7458333551883698, + "step": 2418 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.5791870117188, + "epoch": 0.7742038726196191, + "grad_norm": 0.14344719052314758, + "kl": 0.23957008644938468, + "learning_rate": 2.9467355405499788e-06, + "loss": 0.0858, + "reward": 1.754687535762787, + "reward_std": 0.2000661239027977, + "rewards/accuracy_reward": 0.05000000223517418, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2419 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.9083435058594, + "epoch": 0.7745239238278124, + "grad_norm": 0.05815388634800911, + "kl": 0.17854432128369807, + "learning_rate": 2.9388174266285273e-06, + "loss": 0.0459, + "reward": 1.8260417103767395, + "reward_std": 0.16736711636185647, + "rewards/accuracy_reward": 0.10416666865348816, + "rewards/format_reward": 0.9791666805744171, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2420 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.0437744140625, + "epoch": 0.7748439750360058, + "grad_norm": 0.053522102534770966, + "kl": 0.15169395208358766, + "learning_rate": 2.9309081324403153e-06, + "loss": 0.0783, + "reward": 1.8119791984558105, + "reward_std": 0.15475299432873726, + "rewards/accuracy_reward": 0.09375, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.745312511920929, + "step": 2421 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.0166870117188, + "epoch": 0.7751640262441991, + "grad_norm": 0.3917308449745178, + "kl": 0.21865907534956933, + "learning_rate": 2.923007667864405e-06, + "loss": 0.0946, + "reward": 1.736458384990692, + "reward_std": 0.21290394365787507, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.9604166984558106, + "rewards/tag_count_reward": 0.7427083611488342, + "step": 2422 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.1479339599609, + "epoch": 0.7754840774523923, + "grad_norm": 0.15909360349178314, + "kl": 0.27562965378165244, + "learning_rate": 2.9151160427688296e-06, + "loss": 0.0639, + "reward": 1.7421875476837159, + "reward_std": 0.18707589358091353, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2423 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.3541778564453, + "epoch": 0.7758041286605857, + "grad_norm": 0.08536235988140106, + "kl": 0.14693028368055822, + "learning_rate": 2.907233267010584e-06, + "loss": 0.0413, + "reward": 1.7802083611488342, + "reward_std": 0.10094428583979606, + "rewards/accuracy_reward": 0.054166667722165586, + "rewards/format_reward": 0.9791666746139527, + "rewards/tag_count_reward": 0.746875011920929, + "step": 2424 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.0291870117187, + "epoch": 0.776124179868779, + "grad_norm": 0.12931646406650543, + "kl": 0.23414733335375787, + "learning_rate": 2.8993593504356065e-06, + "loss": 0.051, + "reward": 1.8479167222976685, + "reward_std": 0.16570336520671844, + "rewards/accuracy_reward": 0.13750000428408385, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2425 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.4229431152344, + "epoch": 0.7764442310769724, + "grad_norm": 0.10547000914812088, + "kl": 0.25148463547229766, + "learning_rate": 2.8914943028787756e-06, + "loss": 0.0588, + "reward": 1.707812535762787, + "reward_std": 0.18785856291651726, + "rewards/accuracy_reward": 0.01041666679084301, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2426 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.931265258789, + "epoch": 0.7767642822851656, + "grad_norm": 0.15780065953731537, + "kl": 0.23954942002892493, + "learning_rate": 2.883638134163882e-06, + "loss": 0.0896, + "reward": 1.7562500357627868, + "reward_std": 0.17177270203828812, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7458333551883698, + "step": 2427 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.5020965576172, + "epoch": 0.7770843334933589, + "grad_norm": 0.240781769156456, + "kl": 0.25432484969496727, + "learning_rate": 2.8757908541036338e-06, + "loss": 0.0615, + "reward": 1.8828125596046448, + "reward_std": 0.15776183307170868, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7473958492279053, + "step": 2428 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.5708557128906, + "epoch": 0.7774043847015523, + "grad_norm": 0.3382117450237274, + "kl": 0.3822079569101334, + "learning_rate": 2.8679524724996354e-06, + "loss": 0.1107, + "reward": 1.7885417103767396, + "reward_std": 0.2685310781002045, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2429 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.7562744140625, + "epoch": 0.7777244359097456, + "grad_norm": 0.2815842926502228, + "kl": 0.20215894728899003, + "learning_rate": 2.8601229991423787e-06, + "loss": 0.0719, + "reward": 1.7885417222976685, + "reward_std": 0.18276797756552696, + "rewards/accuracy_reward": 0.08125000223517417, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 2430 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.8583557128907, + "epoch": 0.7780444871179388, + "grad_norm": 0.2211749106645584, + "kl": 0.2775234118103981, + "learning_rate": 2.8523024438112236e-06, + "loss": 0.1116, + "reward": 1.8385417222976685, + "reward_std": 0.18420382663607598, + "rewards/accuracy_reward": 0.13125000353902577, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2431 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.7041748046875, + "epoch": 0.7783645383261322, + "grad_norm": 0.15589836239814758, + "kl": 0.3906994819641113, + "learning_rate": 2.8444908162743957e-06, + "loss": 0.0551, + "reward": 1.8416666865348816, + "reward_std": 0.1611462078988552, + "rewards/accuracy_reward": 0.13125000447034835, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7458333373069763, + "step": 2432 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.9520965576172, + "epoch": 0.7786845895343255, + "grad_norm": 0.15969502925872803, + "kl": 0.29722789898514745, + "learning_rate": 2.836688126288968e-06, + "loss": 0.0927, + "reward": 1.8218750357627869, + "reward_std": 0.17098113447427749, + "rewards/accuracy_reward": 0.1166666716337204, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7447916805744171, + "step": 2433 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.7354370117188, + "epoch": 0.7790046407425189, + "grad_norm": 0.2324097454547882, + "kl": 0.5922939941287041, + "learning_rate": 2.828894383600851e-06, + "loss": 0.1457, + "reward": 1.8229167222976685, + "reward_std": 0.2708844006061554, + "rewards/accuracy_reward": 0.12916667126119136, + "rewards/format_reward": 0.9541666984558106, + "rewards/tag_count_reward": 0.7395833671092987, + "step": 2434 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.7291870117188, + "epoch": 0.7793246919507121, + "grad_norm": 0.1611299216747284, + "kl": 0.2276984043419361, + "learning_rate": 2.8211095979447733e-06, + "loss": 0.0911, + "reward": 1.817187535762787, + "reward_std": 0.18277856037020684, + "rewards/accuracy_reward": 0.10833333693444729, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2435 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.2291839599609, + "epoch": 0.7796447431589054, + "grad_norm": 0.281716912984848, + "kl": 0.5017144948244094, + "learning_rate": 2.8133337790442838e-06, + "loss": 0.1155, + "reward": 1.7838541984558105, + "reward_std": 0.21507382094860078, + "rewards/accuracy_reward": 0.09166666828095912, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2436 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.5104370117188, + "epoch": 0.7799647943670988, + "grad_norm": 0.2546637952327728, + "kl": 0.32052686661481855, + "learning_rate": 2.805566936611728e-06, + "loss": 0.1056, + "reward": 1.765625035762787, + "reward_std": 0.213297700881958, + "rewards/accuracy_reward": 0.06666666772216559, + "rewards/format_reward": 0.9562500298023224, + "rewards/tag_count_reward": 0.7427083432674408, + "step": 2437 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.7896057128906, + "epoch": 0.7802848455752921, + "grad_norm": 0.2136593610048294, + "kl": 0.37920288145542147, + "learning_rate": 2.7978090803482407e-06, + "loss": 0.0881, + "reward": 1.742187535762787, + "reward_std": 0.22298106253147126, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2438 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.2854370117187, + "epoch": 0.7806048967834853, + "grad_norm": 0.2064886838197708, + "kl": 0.3057727158069611, + "learning_rate": 2.790060219943731e-06, + "loss": 0.0851, + "reward": 1.7432291984558106, + "reward_std": 0.23816211745142937, + "rewards/accuracy_reward": 0.05208333507180214, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.736979192495346, + "step": 2439 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.9708618164062, + "epoch": 0.7809249479916787, + "grad_norm": 0.24506452679634094, + "kl": 0.35978928543627264, + "learning_rate": 2.782320365076874e-06, + "loss": 0.0667, + "reward": 1.814062523841858, + "reward_std": 0.17409721985459328, + "rewards/accuracy_reward": 0.10208333358168602, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7432291746139527, + "step": 2440 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.9250183105469, + "epoch": 0.781244999199872, + "grad_norm": 0.47220608592033386, + "kl": 0.5047030732035637, + "learning_rate": 2.7745895254150924e-06, + "loss": 0.1056, + "reward": 1.856250035762787, + "reward_std": 0.2518445745110512, + "rewards/accuracy_reward": 0.17083333544433116, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7354166746139527, + "step": 2441 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.6875244140625, + "epoch": 0.7815650504080653, + "grad_norm": 0.2580389082431793, + "kl": 0.48905070424079894, + "learning_rate": 2.766867710614557e-06, + "loss": 0.1298, + "reward": 1.7447916865348816, + "reward_std": 0.328003853559494, + "rewards/accuracy_reward": 0.08333333618938923, + "rewards/format_reward": 0.9354166865348816, + "rewards/tag_count_reward": 0.7260416865348815, + "step": 2442 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.9521118164063, + "epoch": 0.7818851016162586, + "grad_norm": 0.1950906366109848, + "kl": 0.34237351566553115, + "learning_rate": 2.7591549303201513e-06, + "loss": 0.1036, + "reward": 1.818750023841858, + "reward_std": 0.24801153615117072, + "rewards/accuracy_reward": 0.12708333767950536, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7333333551883697, + "step": 2443 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.5562683105469, + "epoch": 0.7822051528244519, + "grad_norm": 0.20304733514785767, + "kl": 0.30522238165140153, + "learning_rate": 2.75145119416549e-06, + "loss": 0.1117, + "reward": 1.7088542103767395, + "reward_std": 0.23408942371606828, + "rewards/accuracy_reward": 0.018750000558793545, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7401041924953461, + "step": 2444 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.1750244140625, + "epoch": 0.7825252040326452, + "grad_norm": 0.17914730310440063, + "kl": 0.4073436066508293, + "learning_rate": 2.7437565117728805e-06, + "loss": 0.1271, + "reward": 1.7020833611488342, + "reward_std": 0.22345729991793634, + "rewards/accuracy_reward": 0.01875, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7333333551883697, + "step": 2445 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.3812622070312, + "epoch": 0.7828452552408386, + "grad_norm": 0.42733335494995117, + "kl": 0.38346418291330336, + "learning_rate": 2.7360708927533285e-06, + "loss": 0.1283, + "reward": 1.8026042103767395, + "reward_std": 0.2687413990497589, + "rewards/accuracy_reward": 0.11458333656191826, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2446 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.4437805175781, + "epoch": 0.7831653064490318, + "grad_norm": 0.13012735545635223, + "kl": 0.2576710045337677, + "learning_rate": 2.7283943467065153e-06, + "loss": 0.0592, + "reward": 1.8531250715255738, + "reward_std": 0.17226756662130355, + "rewards/accuracy_reward": 0.14583333879709243, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2447 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.4604370117188, + "epoch": 0.7834853576572252, + "grad_norm": 0.13805796205997467, + "kl": 0.33923321291804315, + "learning_rate": 2.7207268832207913e-06, + "loss": 0.0902, + "reward": 1.717187523841858, + "reward_std": 0.1830857887864113, + "rewards/accuracy_reward": 0.012500000186264515, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7401041924953461, + "step": 2448 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.3437652587891, + "epoch": 0.7838054088654185, + "grad_norm": 0.18494828045368195, + "kl": 0.2963532693684101, + "learning_rate": 2.7130685118731615e-06, + "loss": 0.0969, + "reward": 1.7625000476837158, + "reward_std": 0.2770323887467384, + "rewards/accuracy_reward": 0.07083333507180214, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2449 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.452099609375, + "epoch": 0.7841254600736117, + "grad_norm": 0.12305200845003128, + "kl": 0.35416630394756793, + "learning_rate": 2.7054192422292737e-06, + "loss": 0.0769, + "reward": 1.8255208730697632, + "reward_std": 0.22889174297451972, + "rewards/accuracy_reward": 0.13750000316649674, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2450 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.2500183105469, + "epoch": 0.7844455112818051, + "grad_norm": 0.13378387689590454, + "kl": 0.24118001461029054, + "learning_rate": 2.6977790838434126e-06, + "loss": 0.0881, + "reward": 1.8208333849906921, + "reward_std": 0.19077629521489142, + "rewards/accuracy_reward": 0.12291667070239783, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2451 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.7333465576172, + "epoch": 0.7847655624899984, + "grad_norm": 0.14180435240268707, + "kl": 0.2530834019184113, + "learning_rate": 2.6901480462584707e-06, + "loss": 0.093, + "reward": 1.7291666984558105, + "reward_std": 0.23850278556346893, + "rewards/accuracy_reward": 0.037500002235174176, + "rewards/format_reward": 0.9541666984558106, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2452 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.8187683105468, + "epoch": 0.7850856136981917, + "grad_norm": 0.14379824697971344, + "kl": 0.2346596010029316, + "learning_rate": 2.68252613900596e-06, + "loss": 0.0673, + "reward": 1.7880208611488342, + "reward_std": 0.18742336928844452, + "rewards/accuracy_reward": 0.08958333544433117, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2453 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.120849609375, + "epoch": 0.785405664906385, + "grad_norm": 0.2423066347837448, + "kl": 0.5001774221658707, + "learning_rate": 2.674913371605984e-06, + "loss": 0.0934, + "reward": 1.7817708611488343, + "reward_std": 0.24941302090883255, + "rewards/accuracy_reward": 0.09583333432674408, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2454 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.8250122070312, + "epoch": 0.7857257161145783, + "grad_norm": 0.10441887378692627, + "kl": 0.2186158448457718, + "learning_rate": 2.6673097535672287e-06, + "loss": 0.0599, + "reward": 1.8078125357627868, + "reward_std": 0.18876262679696082, + "rewards/accuracy_reward": 0.08958333656191826, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7473958492279053, + "step": 2455 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.7771026611329, + "epoch": 0.7860457673227716, + "grad_norm": 0.1312214881181717, + "kl": 0.22390735670924186, + "learning_rate": 2.6597152943869542e-06, + "loss": 0.0847, + "reward": 1.7880208492279053, + "reward_std": 0.19233649373054504, + "rewards/accuracy_reward": 0.08541666977107525, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2456 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.6646118164062, + "epoch": 0.786365818530965, + "grad_norm": 0.22349661588668823, + "kl": 0.32182138562202456, + "learning_rate": 2.652130003550981e-06, + "loss": 0.0887, + "reward": 1.7354166984558106, + "reward_std": 0.21236109361052513, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.950000011920929, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2457 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.320849609375, + "epoch": 0.7866858697391582, + "grad_norm": 0.11371547728776932, + "kl": 0.2892355978488922, + "learning_rate": 2.6445538905336764e-06, + "loss": 0.1003, + "reward": 1.8052083730697632, + "reward_std": 0.2052166000008583, + "rewards/accuracy_reward": 0.11041667181998491, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2458 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.4708435058594, + "epoch": 0.7870059209473516, + "grad_norm": 0.12026123702526093, + "kl": 0.38724498003721236, + "learning_rate": 2.6369869647979474e-06, + "loss": 0.0708, + "reward": 1.7208333611488342, + "reward_std": 0.2167038567364216, + "rewards/accuracy_reward": 0.02500000111758709, + "rewards/format_reward": 0.9562500298023224, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2459 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.3375183105469, + "epoch": 0.7873259721555449, + "grad_norm": 0.23202356696128845, + "kl": 0.3304444134235382, + "learning_rate": 2.6294292357952166e-06, + "loss": 0.0875, + "reward": 1.7151041984558106, + "reward_std": 0.20461869090795518, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2460 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.9562591552734, + "epoch": 0.7876460233637382, + "grad_norm": 0.1585400104522705, + "kl": 0.35580268651247027, + "learning_rate": 2.621880712965431e-06, + "loss": 0.1243, + "reward": 1.7515625476837158, + "reward_std": 0.2352686658501625, + "rewards/accuracy_reward": 0.05833333563059569, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2461 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.7437744140625, + "epoch": 0.7879660745719315, + "grad_norm": 0.39485612511634827, + "kl": 0.21709256619215012, + "learning_rate": 2.614341405737032e-06, + "loss": 0.0933, + "reward": 1.7697917222976685, + "reward_std": 0.24684084132313727, + "rewards/accuracy_reward": 0.08125000316649675, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7385416924953461, + "step": 2462 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.4083557128906, + "epoch": 0.7882861257801248, + "grad_norm": 0.1640438437461853, + "kl": 0.2398224614560604, + "learning_rate": 2.606811323526952e-06, + "loss": 0.0811, + "reward": 1.8218750476837158, + "reward_std": 0.20028617680072786, + "rewards/accuracy_reward": 0.12083333730697632, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2463 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.8500122070312, + "epoch": 0.7886061769883181, + "grad_norm": 0.12999562919139862, + "kl": 0.2820978585630655, + "learning_rate": 2.5992904757406025e-06, + "loss": 0.0903, + "reward": 1.7838542103767394, + "reward_std": 0.212898188829422, + "rewards/accuracy_reward": 0.08125000335276127, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2464 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.145849609375, + "epoch": 0.7889262281965115, + "grad_norm": 0.26787373423576355, + "kl": 0.35498605370521547, + "learning_rate": 2.5917788717718563e-06, + "loss": 0.0853, + "reward": 1.7744791984558106, + "reward_std": 0.20152606964111328, + "rewards/accuracy_reward": 0.07083333637565374, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2465 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.4375244140625, + "epoch": 0.7892462794047047, + "grad_norm": 0.2707747519016266, + "kl": 0.21967264786362647, + "learning_rate": 2.584276521003046e-06, + "loss": 0.1241, + "reward": 1.775000023841858, + "reward_std": 0.25089699029922485, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2466 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.7021057128907, + "epoch": 0.789566330612898, + "grad_norm": 0.2889306843280792, + "kl": 0.36077115684747696, + "learning_rate": 2.5767834328049444e-06, + "loss": 0.1118, + "reward": 1.7328125357627868, + "reward_std": 0.23334373384714127, + "rewards/accuracy_reward": 0.04791666865348816, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7328125238418579, + "step": 2467 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.4750183105468, + "epoch": 0.7898863818210914, + "grad_norm": 0.12038398534059525, + "kl": 0.3180027477443218, + "learning_rate": 2.56929961653675e-06, + "loss": 0.0808, + "reward": 1.7505208730697632, + "reward_std": 0.20414262339472772, + "rewards/accuracy_reward": 0.05416666921228171, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2468 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.8916809082032, + "epoch": 0.7902064330292847, + "grad_norm": 0.17740662395954132, + "kl": 0.6491322204470634, + "learning_rate": 2.5618250815460864e-06, + "loss": 0.1696, + "reward": 1.7906250357627869, + "reward_std": 0.3307073026895523, + "rewards/accuracy_reward": 0.13541667070239782, + "rewards/format_reward": 0.9291666865348815, + "rewards/tag_count_reward": 0.7260416924953461, + "step": 2469 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.695849609375, + "epoch": 0.790526484237478, + "grad_norm": 0.19464203715324402, + "kl": 0.40145623236894606, + "learning_rate": 2.5543598371689826e-06, + "loss": 0.1584, + "reward": 1.8687500596046447, + "reward_std": 0.2773083925247192, + "rewards/accuracy_reward": 0.20000000596046447, + "rewards/format_reward": 0.9354166865348816, + "rewards/tag_count_reward": 0.7333333551883697, + "step": 2470 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.8041870117188, + "epoch": 0.7908465354456713, + "grad_norm": 0.41462260484695435, + "kl": 0.38077380508184433, + "learning_rate": 2.546903892729864e-06, + "loss": 0.0834, + "reward": 1.7333333849906922, + "reward_std": 0.19426627084612846, + "rewards/accuracy_reward": 0.0458333345130086, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.735416692495346, + "step": 2471 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.802099609375, + "epoch": 0.7911665866538646, + "grad_norm": 0.31547677516937256, + "kl": 0.5052529156208039, + "learning_rate": 2.539457257541539e-06, + "loss": 0.1091, + "reward": 1.7666667103767395, + "reward_std": 0.2610872372984886, + "rewards/accuracy_reward": 0.09791666772216559, + "rewards/format_reward": 0.9375000238418579, + "rewards/tag_count_reward": 0.7312500238418579, + "step": 2472 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.7895935058593, + "epoch": 0.791486637862058, + "grad_norm": 0.23230285942554474, + "kl": 0.46921139508485793, + "learning_rate": 2.532019940905186e-06, + "loss": 0.1449, + "reward": 1.784895920753479, + "reward_std": 0.2417847713455558, + "rewards/accuracy_reward": 0.11250000409781932, + "rewards/format_reward": 0.9395833492279053, + "rewards/tag_count_reward": 0.7328125238418579, + "step": 2473 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.7812744140625, + "epoch": 0.7918066890702512, + "grad_norm": 0.20156913995742798, + "kl": 0.5675974369049073, + "learning_rate": 2.524591952110349e-06, + "loss": 0.1396, + "reward": 1.6666667103767394, + "reward_std": 0.308664807677269, + "rewards/accuracy_reward": 0.01875000037252903, + "rewards/format_reward": 0.9208333551883697, + "rewards/tag_count_reward": 0.7270833551883698, + "step": 2474 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.689599609375, + "epoch": 0.7921267402784445, + "grad_norm": 0.19317319989204407, + "kl": 0.3741036131978035, + "learning_rate": 2.5171733004349187e-06, + "loss": 0.096, + "reward": 1.842708373069763, + "reward_std": 0.1966053232550621, + "rewards/accuracy_reward": 0.14166667070239783, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2475 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.2812683105469, + "epoch": 0.7924467914866379, + "grad_norm": 0.22295664250850677, + "kl": 0.5214636474847794, + "learning_rate": 2.5097639951451247e-06, + "loss": 0.0917, + "reward": 1.7364583611488342, + "reward_std": 0.20884488373994828, + "rewards/accuracy_reward": 0.05416666883975267, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7322916865348816, + "step": 2476 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.0750183105469, + "epoch": 0.7927668426948312, + "grad_norm": 0.17876215279102325, + "kl": 0.4285578727722168, + "learning_rate": 2.5023640454955167e-06, + "loss": 0.1435, + "reward": 1.7416667222976685, + "reward_std": 0.27694963067770006, + "rewards/accuracy_reward": 0.08333333637565374, + "rewards/format_reward": 0.9270833492279053, + "rewards/tag_count_reward": 0.731250011920929, + "step": 2477 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.6395935058594, + "epoch": 0.7930868939030244, + "grad_norm": 0.3092890679836273, + "kl": 0.3600159421563148, + "learning_rate": 2.4949734607289656e-06, + "loss": 0.1012, + "reward": 1.7359375357627869, + "reward_std": 0.20121132731437683, + "rewards/accuracy_reward": 0.041666668653488156, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 2478 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.558349609375, + "epoch": 0.7934069451112178, + "grad_norm": 0.18029294908046722, + "kl": 0.42183038890361785, + "learning_rate": 2.4875922500766414e-06, + "loss": 0.0984, + "reward": 1.7750000357627869, + "reward_std": 0.2342265397310257, + "rewards/accuracy_reward": 0.07291666921228171, + "rewards/format_reward": 0.9625000298023224, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2479 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.2500244140625, + "epoch": 0.7937269963194111, + "grad_norm": 0.27822908759117126, + "kl": 0.41971677392721174, + "learning_rate": 2.4802204227580095e-06, + "loss": 0.131, + "reward": 1.757812511920929, + "reward_std": 0.23821898847818374, + "rewards/accuracy_reward": 0.07291666828095913, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7328125238418579, + "step": 2480 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.3062683105469, + "epoch": 0.7940470475276045, + "grad_norm": 0.3728947043418884, + "kl": 0.2817161396145821, + "learning_rate": 2.472857987980809e-06, + "loss": 0.1114, + "reward": 1.7854166984558106, + "reward_std": 0.26729344129562377, + "rewards/accuracy_reward": 0.09166667107492685, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.735416692495346, + "step": 2481 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.3041961669921, + "epoch": 0.7943670987357977, + "grad_norm": 0.1849849373102188, + "kl": 0.2868703156709671, + "learning_rate": 2.4655049549410535e-06, + "loss": 0.0632, + "reward": 1.750000035762787, + "reward_std": 0.19100956693291665, + "rewards/accuracy_reward": 0.05208333395421505, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2482 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.050015258789, + "epoch": 0.794687149943991, + "grad_norm": 0.15527111291885376, + "kl": 0.4221248269081116, + "learning_rate": 2.4581613328230093e-06, + "loss": 0.1289, + "reward": 1.7213541984558105, + "reward_std": 0.2918809249997139, + "rewards/accuracy_reward": 0.05208333376795053, + "rewards/format_reward": 0.9333333492279052, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2483 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.483349609375, + "epoch": 0.7950072011521844, + "grad_norm": 0.2829282581806183, + "kl": 0.4145924270153046, + "learning_rate": 2.450827130799193e-06, + "loss": 0.0803, + "reward": 1.789062535762787, + "reward_std": 0.18743065968155861, + "rewards/accuracy_reward": 0.07500000204890966, + "rewards/format_reward": 0.9729166924953461, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2484 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.9354309082031, + "epoch": 0.7953272523603777, + "grad_norm": 0.1710597723722458, + "kl": 0.441159937530756, + "learning_rate": 2.443502358030344e-06, + "loss": 0.089, + "reward": 1.762500023841858, + "reward_std": 0.24500463232398034, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2485 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.5791778564453, + "epoch": 0.7956473035685709, + "grad_norm": 0.17100457847118378, + "kl": 0.27078391164541243, + "learning_rate": 2.436187023665435e-06, + "loss": 0.0777, + "reward": 1.842708373069763, + "reward_std": 0.2264217108488083, + "rewards/accuracy_reward": 0.14375000353902578, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 2486 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.4645935058594, + "epoch": 0.7959673547767643, + "grad_norm": 0.28210291266441345, + "kl": 0.27822469994425775, + "learning_rate": 2.4288811368416466e-06, + "loss": 0.1004, + "reward": 1.795312523841858, + "reward_std": 0.20817887112498284, + "rewards/accuracy_reward": 0.10833333656191826, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7369791805744171, + "step": 2487 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.908349609375, + "epoch": 0.7962874059849576, + "grad_norm": 0.12771768867969513, + "kl": 0.298288094997406, + "learning_rate": 2.421584706684359e-06, + "loss": 0.0958, + "reward": 1.7255208849906922, + "reward_std": 0.18443159461021424, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2488 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.8416900634766, + "epoch": 0.796607457193151, + "grad_norm": 0.1272592395544052, + "kl": 0.22488604262471198, + "learning_rate": 2.4142977423071388e-06, + "loss": 0.074, + "reward": 1.8083333492279052, + "reward_std": 0.17383792996406555, + "rewards/accuracy_reward": 0.09166666883975268, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2489 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.5104431152344, + "epoch": 0.7969275084013442, + "grad_norm": 0.21532893180847168, + "kl": 0.23192031309008598, + "learning_rate": 2.4070202528117326e-06, + "loss": 0.0543, + "reward": 1.7500000476837159, + "reward_std": 0.15162527337670326, + "rewards/accuracy_reward": 0.037500002235174176, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2490 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.958349609375, + "epoch": 0.7972475596095375, + "grad_norm": 0.12722332775592804, + "kl": 0.274930589646101, + "learning_rate": 2.3997522472880496e-06, + "loss": 0.1231, + "reward": 1.752083384990692, + "reward_std": 0.20301416665315627, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2491 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.3729309082031, + "epoch": 0.7975676108177309, + "grad_norm": 0.1642562448978424, + "kl": 0.2974217519164085, + "learning_rate": 2.3924937348141574e-06, + "loss": 0.0781, + "reward": 1.7375000238418579, + "reward_std": 0.13625881671905518, + "rewards/accuracy_reward": 0.03125, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2492 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.7937622070312, + "epoch": 0.7978876620259241, + "grad_norm": 0.19371792674064636, + "kl": 0.25802323296666146, + "learning_rate": 2.385244724456256e-06, + "loss": 0.0859, + "reward": 1.8411459088325501, + "reward_std": 0.2050497278571129, + "rewards/accuracy_reward": 0.13958333767950534, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7432291746139527, + "step": 2493 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.6458557128906, + "epoch": 0.7982077132341174, + "grad_norm": 0.23767606914043427, + "kl": 0.22985709607601165, + "learning_rate": 2.378005225268689e-06, + "loss": 0.0946, + "reward": 1.7791666984558105, + "reward_std": 0.22321388572454454, + "rewards/accuracy_reward": 0.08125000298023224, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2494 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.8916839599609, + "epoch": 0.7985277644423108, + "grad_norm": 0.10459578037261963, + "kl": 0.18133811727166177, + "learning_rate": 2.3707752462939137e-06, + "loss": 0.0511, + "reward": 1.8338542222976684, + "reward_std": 0.12393696308135986, + "rewards/accuracy_reward": 0.11041666995733976, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.7463541865348816, + "step": 2495 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.3854370117188, + "epoch": 0.7988478156505041, + "grad_norm": 0.14623679220676422, + "kl": 0.2732238922268152, + "learning_rate": 2.363554796562498e-06, + "loss": 0.0925, + "reward": 1.7770833730697633, + "reward_std": 0.19980213195085525, + "rewards/accuracy_reward": 0.06458333488553762, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2496 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.018765258789, + "epoch": 0.7991678668586973, + "grad_norm": 0.16956394910812378, + "kl": 0.25806930400431155, + "learning_rate": 2.3563438850931076e-06, + "loss": 0.08, + "reward": 1.8343750357627868, + "reward_std": 0.16371086835861207, + "rewards/accuracy_reward": 0.12708333730697632, + "rewards/format_reward": 0.9666666746139526, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2497 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.739599609375, + "epoch": 0.7994879180668907, + "grad_norm": 0.2760685086250305, + "kl": 0.35679190531373023, + "learning_rate": 2.3491425208924934e-06, + "loss": 0.0844, + "reward": 1.7109375476837159, + "reward_std": 0.2751652516424656, + "rewards/accuracy_reward": 0.03125000037252903, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2498 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.708349609375, + "epoch": 0.799807969275084, + "grad_norm": 0.12274476140737534, + "kl": 0.22531968206167222, + "learning_rate": 2.341950712955481e-06, + "loss": 0.0701, + "reward": 1.8500000476837157, + "reward_std": 0.16242174208164215, + "rewards/accuracy_reward": 0.13750000447034835, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7458333551883698, + "step": 2499 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.5625152587891, + "epoch": 0.8001280204832774, + "grad_norm": 0.14290136098861694, + "kl": 0.24403711333870887, + "learning_rate": 2.334768470264963e-06, + "loss": 0.0905, + "reward": 1.7848958730697633, + "reward_std": 0.2028984658420086, + "rewards/accuracy_reward": 0.08750000260770321, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2500 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.9437744140625, + "epoch": 0.8004480716914706, + "grad_norm": 0.18500693142414093, + "kl": 0.35512991696596147, + "learning_rate": 2.3275958017918787e-06, + "loss": 0.0749, + "reward": 1.803645873069763, + "reward_std": 0.23550339192152023, + "rewards/accuracy_reward": 0.10000000260770321, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2501 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.1937744140625, + "epoch": 0.8007681228996639, + "grad_norm": 0.10297524929046631, + "kl": 0.1982058696448803, + "learning_rate": 2.3204327164952135e-06, + "loss": 0.0697, + "reward": 1.7526041984558105, + "reward_std": 0.10015429258346557, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.975000011920929, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 2502 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.1083557128907, + "epoch": 0.8010881741078573, + "grad_norm": 0.20120938122272491, + "kl": 0.24297235794365407, + "learning_rate": 2.3132792233219814e-06, + "loss": 0.0918, + "reward": 1.8713542342185974, + "reward_std": 0.19600575640797616, + "rewards/accuracy_reward": 0.166666672937572, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2503 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.5979370117187, + "epoch": 0.8014082253160506, + "grad_norm": 0.0977979525923729, + "kl": 0.362245024740696, + "learning_rate": 2.3061353312072166e-06, + "loss": 0.1023, + "reward": 1.7375000357627868, + "reward_std": 0.212438702583313, + "rewards/accuracy_reward": 0.04791666828095913, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.737500011920929, + "step": 2504 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.7250183105468, + "epoch": 0.8017282765242438, + "grad_norm": 0.15742014348506927, + "kl": 0.16011352837085724, + "learning_rate": 2.29900104907396e-06, + "loss": 0.0661, + "reward": 1.7265625238418578, + "reward_std": 0.1467660591006279, + "rewards/accuracy_reward": 0.016666667722165584, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2505 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.1125122070313, + "epoch": 0.8020483277324372, + "grad_norm": 0.10794021189212799, + "kl": 0.18489291295409202, + "learning_rate": 2.2918763858332503e-06, + "loss": 0.1016, + "reward": 1.8979167222976685, + "reward_std": 0.21962611973285676, + "rewards/accuracy_reward": 0.1875000052154064, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2506 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.4854370117188, + "epoch": 0.8023683789406305, + "grad_norm": 0.2276093065738678, + "kl": 0.35973372757434846, + "learning_rate": 2.2847613503841094e-06, + "loss": 0.0942, + "reward": 1.7432291984558106, + "reward_std": 0.1830192506313324, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7348958432674408, + "step": 2507 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.9479339599609, + "epoch": 0.8026884301488239, + "grad_norm": 0.23760341107845306, + "kl": 0.18100916631519795, + "learning_rate": 2.2776559516135354e-06, + "loss": 0.0375, + "reward": 1.767187523841858, + "reward_std": 0.12768873944878578, + "rewards/accuracy_reward": 0.04583333358168602, + "rewards/format_reward": 0.9770833432674408, + "rewards/tag_count_reward": 0.7442708432674408, + "step": 2508 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.4854370117188, + "epoch": 0.8030084813570171, + "grad_norm": 0.2181500494480133, + "kl": 0.23914669267833233, + "learning_rate": 2.2705601983964933e-06, + "loss": 0.0716, + "reward": 1.8031250596046449, + "reward_std": 0.18591777086257935, + "rewards/accuracy_reward": 0.09375000428408384, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.740625011920929, + "step": 2509 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.8729278564454, + "epoch": 0.8033285325652104, + "grad_norm": 0.16643783450126648, + "kl": 0.3333664506673813, + "learning_rate": 2.2634740995958904e-06, + "loss": 0.1014, + "reward": 1.7885417103767396, + "reward_std": 0.21033181324601175, + "rewards/accuracy_reward": 0.08541667275130749, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2510 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.7166809082031, + "epoch": 0.8036485837734038, + "grad_norm": 0.20102167129516602, + "kl": 0.16737534031271933, + "learning_rate": 2.256397664062584e-06, + "loss": 0.0671, + "reward": 1.7489583611488342, + "reward_std": 0.15768709704279898, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2511 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.3250183105469, + "epoch": 0.8039686349815971, + "grad_norm": 0.16902852058410645, + "kl": 0.383270151168108, + "learning_rate": 2.249330900635359e-06, + "loss": 0.0886, + "reward": 1.8244792342185974, + "reward_std": 0.20306602343916894, + "rewards/accuracy_reward": 0.11875000372529029, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2512 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.4354309082031, + "epoch": 0.8042886861897903, + "grad_norm": 0.10241694003343582, + "kl": 0.26669327914714813, + "learning_rate": 2.242273818140921e-06, + "loss": 0.0803, + "reward": 1.7666666865348817, + "reward_std": 0.2016952320933342, + "rewards/accuracy_reward": 0.06041666828095913, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2513 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.0687683105468, + "epoch": 0.8046087373979837, + "grad_norm": 0.10145855695009232, + "kl": 0.20489286333322526, + "learning_rate": 2.2352264253938795e-06, + "loss": 0.0594, + "reward": 1.7677083730697631, + "reward_std": 0.1323221020400524, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2514 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.4312805175781, + "epoch": 0.804928788606177, + "grad_norm": 0.16253073513507843, + "kl": 0.16106326691806316, + "learning_rate": 2.2281887311967454e-06, + "loss": 0.0621, + "reward": 1.7890625238418578, + "reward_std": 0.14671893119812013, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.9708333432674408, + "rewards/tag_count_reward": 0.7453125059604645, + "step": 2515 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.4541870117188, + "epoch": 0.8052488398143703, + "grad_norm": 0.2187085896730423, + "kl": 0.2485386922955513, + "learning_rate": 2.221160744339913e-06, + "loss": 0.0566, + "reward": 1.8010417222976685, + "reward_std": 0.17143189162015915, + "rewards/accuracy_reward": 0.09166666772216558, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7406250059604644, + "step": 2516 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.2645965576172, + "epoch": 0.8055688910225636, + "grad_norm": 0.1277468055486679, + "kl": 0.27220593765378, + "learning_rate": 2.214142473601657e-06, + "loss": 0.1063, + "reward": 1.7572916865348815, + "reward_std": 0.19907682836055757, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7322916805744171, + "step": 2517 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.3958557128906, + "epoch": 0.8058889422307569, + "grad_norm": 0.5361232757568359, + "kl": 0.2836416274309158, + "learning_rate": 2.207133927748104e-06, + "loss": 0.1048, + "reward": 1.7906250357627869, + "reward_std": 0.25850230678915975, + "rewards/accuracy_reward": 0.0958333358168602, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2518 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.8479309082031, + "epoch": 0.8062089934389502, + "grad_norm": 0.21963047981262207, + "kl": 0.1898048844188452, + "learning_rate": 2.2001351155332453e-06, + "loss": 0.0493, + "reward": 1.7864583730697632, + "reward_std": 0.15694307088851928, + "rewards/accuracy_reward": 0.0708333358168602, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7447916805744171, + "step": 2519 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.4979309082031, + "epoch": 0.8065290446471436, + "grad_norm": 0.1178651824593544, + "kl": 0.20024344846606254, + "learning_rate": 2.1931460456989105e-06, + "loss": 0.0952, + "reward": 1.7979167222976684, + "reward_std": 0.18103850185871123, + "rewards/accuracy_reward": 0.0916666690260172, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7437500298023224, + "step": 2520 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.7208465576172, + "epoch": 0.8068490958553368, + "grad_norm": 0.20870809257030487, + "kl": 0.30348594933748246, + "learning_rate": 2.1861667269747623e-06, + "loss": 0.0773, + "reward": 1.7843750596046448, + "reward_std": 0.21300148218870163, + "rewards/accuracy_reward": 0.09375000223517418, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2521 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.870849609375, + "epoch": 0.8071691470635302, + "grad_norm": 0.12147420644760132, + "kl": 0.24489268735051156, + "learning_rate": 2.179197168078281e-06, + "loss": 0.0662, + "reward": 1.8171875476837158, + "reward_std": 0.13280707448720933, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7421875119209289, + "step": 2522 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.9146026611328, + "epoch": 0.8074891982717235, + "grad_norm": 0.20473752915859222, + "kl": 0.33478925600647924, + "learning_rate": 2.1722373777147574e-06, + "loss": 0.1407, + "reward": 1.784375047683716, + "reward_std": 0.27957783192396163, + "rewards/accuracy_reward": 0.10833333358168602, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.7322916865348816, + "step": 2523 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.5479309082032, + "epoch": 0.8078092494799168, + "grad_norm": 0.232964888215065, + "kl": 0.33908804357051847, + "learning_rate": 2.165287364577282e-06, + "loss": 0.1077, + "reward": 1.823958396911621, + "reward_std": 0.23599036335945128, + "rewards/accuracy_reward": 0.11666667070239782, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7385416924953461, + "step": 2524 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.8104370117187, + "epoch": 0.8081293006881101, + "grad_norm": 0.17493554949760437, + "kl": 0.20149580687284468, + "learning_rate": 2.158347137346736e-06, + "loss": 0.0876, + "reward": 1.7171875357627868, + "reward_std": 0.20723508298397064, + "rewards/accuracy_reward": 0.020833334513008596, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2525 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.0270965576171, + "epoch": 0.8084493518963034, + "grad_norm": 0.23425406217575073, + "kl": 0.36248365715146064, + "learning_rate": 2.1514167046917666e-06, + "loss": 0.1021, + "reward": 1.7708333611488343, + "reward_std": 0.24469319060444833, + "rewards/accuracy_reward": 0.07916666828095913, + "rewards/format_reward": 0.9500000298023223, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2526 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.4312683105469, + "epoch": 0.8087694031044967, + "grad_norm": 0.35288989543914795, + "kl": 0.47020969688892367, + "learning_rate": 2.1444960752687994e-06, + "loss": 0.1475, + "reward": 1.7546875476837158, + "reward_std": 0.30719054490327835, + "rewards/accuracy_reward": 0.09375000204890967, + "rewards/format_reward": 0.931250023841858, + "rewards/tag_count_reward": 0.7296875178813934, + "step": 2527 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.0812622070313, + "epoch": 0.8090894543126901, + "grad_norm": 0.14612969756126404, + "kl": 0.1977105811238289, + "learning_rate": 2.1375852577220078e-06, + "loss": 0.0789, + "reward": 1.7734375476837159, + "reward_std": 0.1774771437048912, + "rewards/accuracy_reward": 0.06041667014360428, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7442708492279053, + "step": 2528 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.5041931152343, + "epoch": 0.8094095055208833, + "grad_norm": 0.1703653633594513, + "kl": 0.4641824632883072, + "learning_rate": 2.1306842606833157e-06, + "loss": 0.0848, + "reward": 1.7531250238418579, + "reward_std": 0.22379239052534103, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2529 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.2125213623046, + "epoch": 0.8097295567290766, + "grad_norm": 0.2629508078098297, + "kl": 0.30261474251747134, + "learning_rate": 2.1237930927723736e-06, + "loss": 0.1067, + "reward": 1.7213541984558105, + "reward_std": 0.249358981102705, + "rewards/accuracy_reward": 0.033333334140479565, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2530 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.2291870117188, + "epoch": 0.81004960793727, + "grad_norm": 0.15494459867477417, + "kl": 0.3502315230667591, + "learning_rate": 2.116911762596563e-06, + "loss": 0.1289, + "reward": 1.7026042103767396, + "reward_std": 0.21801614612340928, + "rewards/accuracy_reward": 0.016666667722165584, + "rewards/format_reward": 0.950000011920929, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2531 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.8021057128906, + "epoch": 0.8103696591454633, + "grad_norm": 0.11083235591650009, + "kl": 0.14325247332453728, + "learning_rate": 2.11004027875097e-06, + "loss": 0.0964, + "reward": 1.7223958849906922, + "reward_std": 0.18362916633486748, + "rewards/accuracy_reward": 0.01250000037252903, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7411458432674408, + "step": 2532 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.0771026611328, + "epoch": 0.8106897103536566, + "grad_norm": 0.1683712750673294, + "kl": 0.2535912752151489, + "learning_rate": 2.103178649818387e-06, + "loss": 0.075, + "reward": 1.7786458730697632, + "reward_std": 0.1779824249446392, + "rewards/accuracy_reward": 0.07291666977107525, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2533 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.4458557128906, + "epoch": 0.8110097615618499, + "grad_norm": 0.12049499154090881, + "kl": 0.2638672016561031, + "learning_rate": 2.0963268843692986e-06, + "loss": 0.0826, + "reward": 1.7864583611488343, + "reward_std": 0.22827504426240922, + "rewards/accuracy_reward": 0.10416667014360428, + "rewards/format_reward": 0.943750011920929, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2534 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.2791809082031, + "epoch": 0.8113298127700432, + "grad_norm": 0.11559872329235077, + "kl": 0.2065212272107601, + "learning_rate": 2.089484990961862e-06, + "loss": 0.0803, + "reward": 1.8260417103767395, + "reward_std": 0.1757136031985283, + "rewards/accuracy_reward": 0.11875000353902579, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2535 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.2187805175781, + "epoch": 0.8116498639782365, + "grad_norm": 0.284346342086792, + "kl": 0.4133388787508011, + "learning_rate": 2.0826529781419092e-06, + "loss": 0.0991, + "reward": 1.692187535762787, + "reward_std": 0.17340034022927284, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2536 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.2333435058594, + "epoch": 0.8119699151864298, + "grad_norm": 0.14568276703357697, + "kl": 0.20174489133059978, + "learning_rate": 2.0758308544429317e-06, + "loss": 0.0962, + "reward": 1.7604166865348816, + "reward_std": 0.16792564019560813, + "rewards/accuracy_reward": 0.05208333432674408, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.743750023841858, + "step": 2537 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.8271026611328, + "epoch": 0.8122899663946231, + "grad_norm": 0.11698803305625916, + "kl": 0.19524867199361323, + "learning_rate": 2.069018628386067e-06, + "loss": 0.0881, + "reward": 1.8166666984558106, + "reward_std": 0.21987561136484146, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.9729166924953461, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2538 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.2583557128906, + "epoch": 0.8126100176028165, + "grad_norm": 0.35214290022850037, + "kl": 0.5445257410407066, + "learning_rate": 2.0622163084800904e-06, + "loss": 0.0919, + "reward": 1.814583420753479, + "reward_std": 0.24731214791536332, + "rewards/accuracy_reward": 0.13125000707805157, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7354166746139527, + "step": 2539 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.2666870117188, + "epoch": 0.8129300688110097, + "grad_norm": 0.15203234553337097, + "kl": 0.2172985278069973, + "learning_rate": 2.055423903221404e-06, + "loss": 0.0675, + "reward": 1.7791666984558105, + "reward_std": 0.16751005351543427, + "rewards/accuracy_reward": 0.06875000037252903, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2540 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.4229370117188, + "epoch": 0.813250120019203, + "grad_norm": 0.20225732028484344, + "kl": 0.22776147164404392, + "learning_rate": 2.0486414210940266e-06, + "loss": 0.0509, + "reward": 1.9161458730697631, + "reward_std": 0.17580842301249505, + "rewards/accuracy_reward": 0.19791666977107525, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7432291805744171, + "step": 2541 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.5646026611328, + "epoch": 0.8135701712273964, + "grad_norm": 0.18625636398792267, + "kl": 0.4055021218955517, + "learning_rate": 2.0418688705695846e-06, + "loss": 0.1084, + "reward": 1.7718750596046449, + "reward_std": 0.2145461067557335, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2542 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.064599609375, + "epoch": 0.8138902224355897, + "grad_norm": 0.19601131975650787, + "kl": 0.347163225710392, + "learning_rate": 2.035106260107291e-06, + "loss": 0.0904, + "reward": 1.7125000357627869, + "reward_std": 0.23749838918447494, + "rewards/accuracy_reward": 0.03125000111758709, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2543 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.9958557128906, + "epoch": 0.814210273643783, + "grad_norm": 0.11947452276945114, + "kl": 0.39595833867788316, + "learning_rate": 2.0283535981539537e-06, + "loss": 0.1061, + "reward": 1.7432291984558106, + "reward_std": 0.21747851520776748, + "rewards/accuracy_reward": 0.05833333544433117, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7307291865348816, + "step": 2544 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.2062622070313, + "epoch": 0.8145303248519763, + "grad_norm": 0.19575047492980957, + "kl": 0.31752732023596764, + "learning_rate": 2.021610893143947e-06, + "loss": 0.0987, + "reward": 1.9182292222976685, + "reward_std": 0.2564608708024025, + "rewards/accuracy_reward": 0.22291667740792037, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.736979192495346, + "step": 2545 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.0791778564453, + "epoch": 0.8148503760601696, + "grad_norm": 0.15424421429634094, + "kl": 0.26091369315981866, + "learning_rate": 2.0148781534992135e-06, + "loss": 0.0926, + "reward": 1.7901041984558106, + "reward_std": 0.20669013559818267, + "rewards/accuracy_reward": 0.09375000204890967, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2546 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.2270965576172, + "epoch": 0.815170427268363, + "grad_norm": 0.08642657101154327, + "kl": 0.2408471204340458, + "learning_rate": 2.008155387629245e-06, + "loss": 0.0906, + "reward": 1.7385416865348815, + "reward_std": 0.22244496196508406, + "rewards/accuracy_reward": 0.037500000186264516, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2547 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.0687744140625, + "epoch": 0.8154904784765562, + "grad_norm": 0.19214469194412231, + "kl": 0.431305243819952, + "learning_rate": 2.0014426039310786e-06, + "loss": 0.1049, + "reward": 1.7687500357627868, + "reward_std": 0.22442566826939583, + "rewards/accuracy_reward": 0.08750000316649675, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2548 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.308349609375, + "epoch": 0.8158105296847495, + "grad_norm": 0.12344229966402054, + "kl": 0.21020562946796417, + "learning_rate": 1.9947398107892813e-06, + "loss": 0.0675, + "reward": 1.7697917103767395, + "reward_std": 0.1644446700811386, + "rewards/accuracy_reward": 0.0541666679084301, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2549 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.8750183105469, + "epoch": 0.8161305808929429, + "grad_norm": 0.10697629302740097, + "kl": 0.2528723068535328, + "learning_rate": 1.9880470165759436e-06, + "loss": 0.0409, + "reward": 1.7723958611488342, + "reward_std": 0.161284402012825, + "rewards/accuracy_reward": 0.056250000931322576, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7453125178813934, + "step": 2550 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.681265258789, + "epoch": 0.8164506321011362, + "grad_norm": 0.1915271282196045, + "kl": 0.28193242400884627, + "learning_rate": 1.9813642296506606e-06, + "loss": 0.0633, + "reward": 1.7947917103767395, + "reward_std": 0.15104105249047278, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.740625011920929, + "step": 2551 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.8458557128906, + "epoch": 0.8167706833093294, + "grad_norm": 0.12123015522956848, + "kl": 0.2618663445115089, + "learning_rate": 1.974691458360536e-06, + "loss": 0.0738, + "reward": 1.859895896911621, + "reward_std": 0.25564419105648994, + "rewards/accuracy_reward": 0.15833334177732467, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 2552 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.714599609375, + "epoch": 0.8170907345175228, + "grad_norm": 0.12185723334550858, + "kl": 0.23866596780717372, + "learning_rate": 1.9680287110401584e-06, + "loss": 0.0666, + "reward": 1.7625000476837158, + "reward_std": 0.21922733038663864, + "rewards/accuracy_reward": 0.058333336189389226, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2553 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.758349609375, + "epoch": 0.8174107857257161, + "grad_norm": 0.1164194792509079, + "kl": 0.2658846389502287, + "learning_rate": 1.9613759960115986e-06, + "loss": 0.0811, + "reward": 1.765625035762787, + "reward_std": 0.2186833456158638, + "rewards/accuracy_reward": 0.06666666939854622, + "rewards/format_reward": 0.9583333432674408, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2554 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.0395935058593, + "epoch": 0.8177308369339095, + "grad_norm": 0.08748471736907959, + "kl": 0.1893336571753025, + "learning_rate": 1.9547333215843945e-06, + "loss": 0.087, + "reward": 1.746875035762787, + "reward_std": 0.17420026957988738, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2555 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.8479400634766, + "epoch": 0.8180508881421027, + "grad_norm": 0.14459647238254547, + "kl": 0.3010967392474413, + "learning_rate": 1.948100696055545e-06, + "loss": 0.0752, + "reward": 1.7864583730697632, + "reward_std": 0.20786819905042647, + "rewards/accuracy_reward": 0.08333333544433116, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2556 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.6166931152344, + "epoch": 0.818370939350296, + "grad_norm": 0.10996969789266586, + "kl": 0.19597923345863819, + "learning_rate": 1.9414781277094963e-06, + "loss": 0.0468, + "reward": 1.7760416865348816, + "reward_std": 0.14172435849905013, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2557 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.2666870117188, + "epoch": 0.8186909905584894, + "grad_norm": 0.13527487218379974, + "kl": 0.27192542925477026, + "learning_rate": 1.934865624818132e-06, + "loss": 0.0987, + "reward": 1.7984375476837158, + "reward_std": 0.22232221812009811, + "rewards/accuracy_reward": 0.10000000447034836, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7380208432674408, + "step": 2558 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.4854370117188, + "epoch": 0.8190110417666827, + "grad_norm": 0.10489752143621445, + "kl": 0.21075959838926792, + "learning_rate": 1.928263195640767e-06, + "loss": 0.0593, + "reward": 1.8062500476837158, + "reward_std": 0.19212491065263748, + "rewards/accuracy_reward": 0.10416667293757201, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2559 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.1125305175781, + "epoch": 0.8193310929748759, + "grad_norm": 0.1237000972032547, + "kl": 0.3519850574433804, + "learning_rate": 1.9216708484241275e-06, + "loss": 0.0607, + "reward": 1.7927083849906922, + "reward_std": 0.20241789817810057, + "rewards/accuracy_reward": 0.08541666772216558, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2560 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.5437744140625, + "epoch": 0.8196511441830693, + "grad_norm": 0.09780974686145782, + "kl": 0.2098309613764286, + "learning_rate": 1.915088591402351e-06, + "loss": 0.0873, + "reward": 1.7880208849906922, + "reward_std": 0.2020464301109314, + "rewards/accuracy_reward": 0.09166666865348816, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2561 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.989599609375, + "epoch": 0.8199711953912626, + "grad_norm": 0.13586272299289703, + "kl": 0.30812819600105285, + "learning_rate": 1.908516432796973e-06, + "loss": 0.0929, + "reward": 1.7786458730697632, + "reward_std": 0.22650991678237914, + "rewards/accuracy_reward": 0.08125000055879354, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2562 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.8166931152343, + "epoch": 0.820291246599456, + "grad_norm": 0.10003126412630081, + "kl": 0.21191044226288797, + "learning_rate": 1.9019543808169117e-06, + "loss": 0.0489, + "reward": 1.7963542342185974, + "reward_std": 0.19010281562805176, + "rewards/accuracy_reward": 0.08958333563059569, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7401041924953461, + "step": 2563 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.8041870117188, + "epoch": 0.8206112978076492, + "grad_norm": 0.1322830468416214, + "kl": 0.2218364529311657, + "learning_rate": 1.895402443658465e-06, + "loss": 0.0564, + "reward": 1.7718750357627868, + "reward_std": 0.1890696920454502, + "rewards/accuracy_reward": 0.06250000130385161, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.740625011920929, + "step": 2564 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.5375183105468, + "epoch": 0.8209313490158425, + "grad_norm": 0.1352192908525467, + "kl": 0.29029730148613453, + "learning_rate": 1.888860629505297e-06, + "loss": 0.0376, + "reward": 1.7395833611488343, + "reward_std": 0.1509907476603985, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2565 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.2833435058594, + "epoch": 0.8212514002240359, + "grad_norm": 0.1863587647676468, + "kl": 0.18537102192640303, + "learning_rate": 1.8823289465284244e-06, + "loss": 0.083, + "reward": 1.7229166865348815, + "reward_std": 0.16097078919410707, + "rewards/accuracy_reward": 0.018750000558793545, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2566 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.0083526611328, + "epoch": 0.8215714514322292, + "grad_norm": 0.08944018930196762, + "kl": 0.20662855319678783, + "learning_rate": 1.8758074028862161e-06, + "loss": 0.0705, + "reward": 1.7895833730697632, + "reward_std": 0.1706282950937748, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2567 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.0479370117188, + "epoch": 0.8218915026404224, + "grad_norm": 0.29065433144569397, + "kl": 0.29194722771644593, + "learning_rate": 1.869296006724366e-06, + "loss": 0.0541, + "reward": 1.7531250357627868, + "reward_std": 0.21704104840755462, + "rewards/accuracy_reward": 0.05416666883975267, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2568 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.064599609375, + "epoch": 0.8222115538486158, + "grad_norm": 0.16143926978111267, + "kl": 0.3712439864873886, + "learning_rate": 1.8627947661759027e-06, + "loss": 0.1243, + "reward": 1.7343750238418578, + "reward_std": 0.22560944259166718, + "rewards/accuracy_reward": 0.04375000149011612, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7364583671092987, + "step": 2569 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.4041809082031, + "epoch": 0.8225316050568091, + "grad_norm": 0.15806716680526733, + "kl": 0.2439609609544277, + "learning_rate": 1.8563036893611664e-06, + "loss": 0.0698, + "reward": 1.7145833611488341, + "reward_std": 0.14413456842303277, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2570 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.1458465576172, + "epoch": 0.8228516562650025, + "grad_norm": 0.10522006452083588, + "kl": 0.23359978944063187, + "learning_rate": 1.8498227843878025e-06, + "loss": 0.0774, + "reward": 1.7682292103767394, + "reward_std": 0.18255340680480003, + "rewards/accuracy_reward": 0.06458333507180214, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 2571 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.4062744140625, + "epoch": 0.8231717074731957, + "grad_norm": 0.12738092243671417, + "kl": 0.4491455115377903, + "learning_rate": 1.8433520593507515e-06, + "loss": 0.1236, + "reward": 1.7473958730697632, + "reward_std": 0.22424405813217163, + "rewards/accuracy_reward": 0.05625000149011612, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.739062511920929, + "step": 2572 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.6562713623047, + "epoch": 0.823491758681389, + "grad_norm": 0.184599369764328, + "kl": 0.6130537793040276, + "learning_rate": 1.8368915223322392e-06, + "loss": 0.1159, + "reward": 1.7661458730697632, + "reward_std": 0.26158987879753115, + "rewards/accuracy_reward": 0.0916666690260172, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.732812511920929, + "step": 2573 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.7875183105468, + "epoch": 0.8238118098895824, + "grad_norm": 0.15006475150585175, + "kl": 0.25845105201005936, + "learning_rate": 1.8304411814017654e-06, + "loss": 0.0903, + "reward": 1.752083384990692, + "reward_std": 0.20691974982619285, + "rewards/accuracy_reward": 0.05625000260770321, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.735416692495346, + "step": 2574 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.2041870117188, + "epoch": 0.8241318610977757, + "grad_norm": 0.20258435606956482, + "kl": 0.5631472624838352, + "learning_rate": 1.8240010446160973e-06, + "loss": 0.1084, + "reward": 1.7848958849906922, + "reward_std": 0.27376395016908645, + "rewards/accuracy_reward": 0.10625000149011612, + "rewards/format_reward": 0.9479166805744171, + "rewards/tag_count_reward": 0.7307291865348816, + "step": 2575 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.4166961669922, + "epoch": 0.8244519123059689, + "grad_norm": 0.1155025064945221, + "kl": 0.31104291453957555, + "learning_rate": 1.817571120019248e-06, + "loss": 0.0822, + "reward": 1.8125000596046448, + "reward_std": 0.18279174268245696, + "rewards/accuracy_reward": 0.11458333637565374, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2576 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.0000183105469, + "epoch": 0.8247719635141623, + "grad_norm": 0.17100323736667633, + "kl": 0.2843918614089489, + "learning_rate": 1.811151415642487e-06, + "loss": 0.0943, + "reward": 1.7473958730697632, + "reward_std": 0.21448913291096688, + "rewards/accuracy_reward": 0.03958333469927311, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.739062511920929, + "step": 2577 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.7916809082031, + "epoch": 0.8250920147223556, + "grad_norm": 0.10874886065721512, + "kl": 0.35035271644592286, + "learning_rate": 1.8047419395043086e-06, + "loss": 0.1229, + "reward": 1.7109375476837159, + "reward_std": 0.23503211364150048, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.9333333492279052, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2578 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.4750244140625, + "epoch": 0.8254120659305488, + "grad_norm": 0.11109782010316849, + "kl": 0.42124783545732497, + "learning_rate": 1.798342699610438e-06, + "loss": 0.1023, + "reward": 1.707812523841858, + "reward_std": 0.25911448076367377, + "rewards/accuracy_reward": 0.03125000111758709, + "rewards/format_reward": 0.9479166805744171, + "rewards/tag_count_reward": 0.7286458492279053, + "step": 2579 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.520849609375, + "epoch": 0.8257321171387422, + "grad_norm": 0.19008983671665192, + "kl": 0.3546397894620895, + "learning_rate": 1.7919537039538127e-06, + "loss": 0.1139, + "reward": 1.7692708611488341, + "reward_std": 0.23377685472369195, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7338541805744171, + "step": 2580 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.2687744140625, + "epoch": 0.8260521683469355, + "grad_norm": 0.2240857034921646, + "kl": 0.29563959799706935, + "learning_rate": 1.7855749605145722e-06, + "loss": 0.0933, + "reward": 1.7614583730697633, + "reward_std": 0.18812100738286971, + "rewards/accuracy_reward": 0.05416666828095913, + "rewards/format_reward": 0.9687500298023224, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2581 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.7458435058594, + "epoch": 0.8263722195551289, + "grad_norm": 0.28932392597198486, + "kl": 0.3456702195107937, + "learning_rate": 1.7792064772600547e-06, + "loss": 0.0919, + "reward": 1.7651042222976685, + "reward_std": 0.229879729449749, + "rewards/accuracy_reward": 0.07083333563059568, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7338541805744171, + "step": 2582 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.4062683105469, + "epoch": 0.8266922707633221, + "grad_norm": 0.13011673092842102, + "kl": 0.16959348358213902, + "learning_rate": 1.7728482621447818e-06, + "loss": 0.0893, + "reward": 1.7395833611488343, + "reward_std": 0.1753227561712265, + "rewards/accuracy_reward": 0.03125000204890967, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.743750023841858, + "step": 2583 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.802099609375, + "epoch": 0.8270123219715154, + "grad_norm": 0.16126485168933868, + "kl": 0.21559477150440215, + "learning_rate": 1.766500323110445e-06, + "loss": 0.051, + "reward": 1.8192708969116211, + "reward_std": 0.15779685974121094, + "rewards/accuracy_reward": 0.10416667088866234, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7442708611488342, + "step": 2584 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.8729370117187, + "epoch": 0.8273323731797088, + "grad_norm": 0.28615960478782654, + "kl": 0.29707399681210517, + "learning_rate": 1.7601626680859073e-06, + "loss": 0.1335, + "reward": 1.8328125476837158, + "reward_std": 0.2527454063296318, + "rewards/accuracy_reward": 0.14791667070239783, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7348958611488342, + "step": 2585 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.9645935058594, + "epoch": 0.8276524243879021, + "grad_norm": 0.10823047906160355, + "kl": 0.22162544690072536, + "learning_rate": 1.7538353049871826e-06, + "loss": 0.0416, + "reward": 1.7625000119209289, + "reward_std": 0.140020003169775, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2586 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.1250183105469, + "epoch": 0.8279724755960953, + "grad_norm": 0.19290484488010406, + "kl": 0.16696857735514642, + "learning_rate": 1.7475182417174318e-06, + "loss": 0.0861, + "reward": 1.794270884990692, + "reward_std": 0.22950992360711098, + "rewards/accuracy_reward": 0.08333333656191826, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2587 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.9187683105469, + "epoch": 0.8282925268042887, + "grad_norm": 0.10784605145454407, + "kl": 0.30665692016482354, + "learning_rate": 1.7412114861669482e-06, + "loss": 0.077, + "reward": 1.8307292580604553, + "reward_std": 0.21484979316592218, + "rewards/accuracy_reward": 0.12708333600312471, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2588 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.3979431152344, + "epoch": 0.828612578012482, + "grad_norm": 0.2151651382446289, + "kl": 0.41664362922310827, + "learning_rate": 1.7349150462131536e-06, + "loss": 0.0866, + "reward": 1.8447917103767395, + "reward_std": 0.29800059348344804, + "rewards/accuracy_reward": 0.1562500050291419, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 2589 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.6187622070313, + "epoch": 0.8289326292206753, + "grad_norm": 0.1969972550868988, + "kl": 0.3401130996644497, + "learning_rate": 1.7286289297205826e-06, + "loss": 0.0593, + "reward": 1.7770833611488341, + "reward_std": 0.260301998257637, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2590 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.2979370117188, + "epoch": 0.8292526804288686, + "grad_norm": 0.27202510833740234, + "kl": 0.33993891403079035, + "learning_rate": 1.722353144540877e-06, + "loss": 0.1264, + "reward": 1.8276042222976685, + "reward_std": 0.28314041793346406, + "rewards/accuracy_reward": 0.15833333935588598, + "rewards/format_reward": 0.9354166805744171, + "rewards/tag_count_reward": 0.7338541924953461, + "step": 2591 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.4000305175781, + "epoch": 0.8295727316370619, + "grad_norm": 0.21395038068294525, + "kl": 0.3511948026716709, + "learning_rate": 1.716087698512775e-06, + "loss": 0.0687, + "reward": 1.7541667103767395, + "reward_std": 0.21019265428185463, + "rewards/accuracy_reward": 0.05416666921228171, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7395833432674408, + "step": 2592 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.2916839599609, + "epoch": 0.8298927828452553, + "grad_norm": 0.17267273366451263, + "kl": 0.344217037782073, + "learning_rate": 1.7098325994620934e-06, + "loss": 0.0797, + "reward": 1.856770884990692, + "reward_std": 0.229797425866127, + "rewards/accuracy_reward": 0.15000000447034836, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2593 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.6041870117188, + "epoch": 0.8302128340534486, + "grad_norm": 0.24830318987369537, + "kl": 0.29451605267822745, + "learning_rate": 1.703587855201736e-06, + "loss": 0.0944, + "reward": 1.7479166984558105, + "reward_std": 0.18984657078981398, + "rewards/accuracy_reward": 0.039583333395421506, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2594 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.0833557128906, + "epoch": 0.8305328852616418, + "grad_norm": 0.10851777344942093, + "kl": 0.19102349653840064, + "learning_rate": 1.6973534735316666e-06, + "loss": 0.0364, + "reward": 1.7375000238418579, + "reward_std": 0.14052069038152695, + "rewards/accuracy_reward": 0.027083333767950534, + "rewards/format_reward": 0.9708333432674408, + "rewards/tag_count_reward": 0.7395833432674408, + "step": 2595 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.008349609375, + "epoch": 0.8308529364698352, + "grad_norm": 0.5246524810791016, + "kl": 0.30549296662211417, + "learning_rate": 1.6911294622389075e-06, + "loss": 0.1013, + "reward": 1.7406250357627868, + "reward_std": 0.24924185127019882, + "rewards/accuracy_reward": 0.06041666977107525, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 2596 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.1958557128906, + "epoch": 0.8311729876780285, + "grad_norm": 0.10506986081600189, + "kl": 0.30327147617936134, + "learning_rate": 1.6849158290975277e-06, + "loss": 0.0728, + "reward": 1.7791667103767395, + "reward_std": 0.21810988560318947, + "rewards/accuracy_reward": 0.08958333544433117, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2597 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.0312774658203, + "epoch": 0.8314930388862218, + "grad_norm": 0.15811342000961304, + "kl": 0.36188592612743375, + "learning_rate": 1.6787125818686322e-06, + "loss": 0.1335, + "reward": 1.7531250357627868, + "reward_std": 0.3249521702528, + "rewards/accuracy_reward": 0.09375000465661287, + "rewards/format_reward": 0.931250023841858, + "rewards/tag_count_reward": 0.7281250178813934, + "step": 2598 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.2604370117188, + "epoch": 0.8318130900944151, + "grad_norm": 0.21360376477241516, + "kl": 0.3731867164373398, + "learning_rate": 1.6725197283003548e-06, + "loss": 0.0885, + "reward": 1.8067708730697631, + "reward_std": 0.22768967226147652, + "rewards/accuracy_reward": 0.10625000465661287, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2599 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.5771026611328, + "epoch": 0.8321331413026084, + "grad_norm": 0.18168602883815765, + "kl": 0.3145070172846317, + "learning_rate": 1.6663372761278507e-06, + "loss": 0.0948, + "reward": 1.7807292222976685, + "reward_std": 0.18614666685461997, + "rewards/accuracy_reward": 0.07916666977107525, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2600 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.2708435058594, + "epoch": 0.8324531925108017, + "grad_norm": 0.34099605679512024, + "kl": 0.3224764481186867, + "learning_rate": 1.6601652330732732e-06, + "loss": 0.1071, + "reward": 1.7057291984558105, + "reward_std": 0.23296904936432838, + "rewards/accuracy_reward": 0.018750000558793545, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7328125178813935, + "step": 2601 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.7791931152344, + "epoch": 0.8327732437189951, + "grad_norm": 0.2730673849582672, + "kl": 0.5597857162356377, + "learning_rate": 1.6540036068457833e-06, + "loss": 0.1304, + "reward": 1.8041667103767396, + "reward_std": 0.24048233777284622, + "rewards/accuracy_reward": 0.11666667014360428, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7333333492279053, + "step": 2602 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.2062774658203, + "epoch": 0.8330932949271883, + "grad_norm": 0.17329207062721252, + "kl": 0.39218656048178674, + "learning_rate": 1.647852405141529e-06, + "loss": 0.105, + "reward": 1.7619791984558106, + "reward_std": 0.2654853545129299, + "rewards/accuracy_reward": 0.07500000335276127, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7307291805744172, + "step": 2603 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.6500183105469, + "epoch": 0.8334133461353816, + "grad_norm": 0.16453422605991364, + "kl": 0.42657639682292936, + "learning_rate": 1.6417116356436348e-06, + "loss": 0.0935, + "reward": 1.7958333849906922, + "reward_std": 0.2776599481701851, + "rewards/accuracy_reward": 0.11041666977107525, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7312500238418579, + "step": 2604 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.5770935058594, + "epoch": 0.833733397343575, + "grad_norm": 0.27947208285331726, + "kl": 0.570138244330883, + "learning_rate": 1.6355813060221993e-06, + "loss": 0.0999, + "reward": 1.7619792222976685, + "reward_std": 0.25141082108020785, + "rewards/accuracy_reward": 0.0895833358168602, + "rewards/format_reward": 0.9395833671092987, + "rewards/tag_count_reward": 0.7328125238418579, + "step": 2605 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.933349609375, + "epoch": 0.8340534485517683, + "grad_norm": 0.1307932436466217, + "kl": 0.2841352041810751, + "learning_rate": 1.6294614239342764e-06, + "loss": 0.0737, + "reward": 1.7343750476837159, + "reward_std": 0.17624344304203987, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2606 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.839599609375, + "epoch": 0.8343734997599616, + "grad_norm": 0.17136070132255554, + "kl": 0.32739310935139654, + "learning_rate": 1.6233519970238732e-06, + "loss": 0.0843, + "reward": 1.7619792222976685, + "reward_std": 0.18518462628126145, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2607 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.8708557128906, + "epoch": 0.8346935509681549, + "grad_norm": 0.12200771272182465, + "kl": 0.25093438662588596, + "learning_rate": 1.6172530329219416e-06, + "loss": 0.0834, + "reward": 1.8307292222976685, + "reward_std": 0.18550884053111077, + "rewards/accuracy_reward": 0.12291667200624942, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7411458432674408, + "step": 2608 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.4666931152344, + "epoch": 0.8350136021763482, + "grad_norm": 0.18844038248062134, + "kl": 0.3024763770401478, + "learning_rate": 1.6111645392463548e-06, + "loss": 0.071, + "reward": 1.7739583611488343, + "reward_std": 0.20886053442955016, + "rewards/accuracy_reward": 0.06875000298023223, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2609 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.9791870117188, + "epoch": 0.8353336533845416, + "grad_norm": 0.21762879192829132, + "kl": 0.36289220452308657, + "learning_rate": 1.6050865236019165e-06, + "loss": 0.0809, + "reward": 1.7531250357627868, + "reward_std": 0.27787193953990935, + "rewards/accuracy_reward": 0.07916666679084301, + "rewards/format_reward": 0.9395833492279053, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2610 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.4062683105469, + "epoch": 0.8356537045927348, + "grad_norm": 0.2098568081855774, + "kl": 0.3839412644505501, + "learning_rate": 1.5990189935803402e-06, + "loss": 0.1146, + "reward": 1.7791666984558105, + "reward_std": 0.24917776137590408, + "rewards/accuracy_reward": 0.08750000037252903, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2611 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.5666931152343, + "epoch": 0.8359737558009281, + "grad_norm": 0.1561695784330368, + "kl": 0.38026146665215493, + "learning_rate": 1.5929619567602429e-06, + "loss": 0.0763, + "reward": 1.8197917103767396, + "reward_std": 0.21357117891311644, + "rewards/accuracy_reward": 0.1416666716337204, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2612 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.8062683105469, + "epoch": 0.8362938070091215, + "grad_norm": 0.13479575514793396, + "kl": 0.1574801068753004, + "learning_rate": 1.5869154207071347e-06, + "loss": 0.0659, + "reward": 1.8072916984558105, + "reward_std": 0.20274921506643295, + "rewards/accuracy_reward": 0.08958333749324084, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.746875011920929, + "step": 2613 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.0937713623047, + "epoch": 0.8366138582173148, + "grad_norm": 0.17284269630908966, + "kl": 0.2832434602081776, + "learning_rate": 1.58087939297341e-06, + "loss": 0.0588, + "reward": 1.8270833849906922, + "reward_std": 0.15429509207606315, + "rewards/accuracy_reward": 0.11666667014360428, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2614 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.4229309082032, + "epoch": 0.836933909425508, + "grad_norm": 0.09957928210496902, + "kl": 0.12662966325879096, + "learning_rate": 1.5748538810983382e-06, + "loss": 0.0306, + "reward": 1.9494792222976685, + "reward_std": 0.14919039011001586, + "rewards/accuracy_reward": 0.2250000074505806, + "rewards/format_reward": 0.9791666865348816, + "rewards/tag_count_reward": 0.745312511920929, + "step": 2615 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.9479309082031, + "epoch": 0.8372539606337014, + "grad_norm": 0.14037205278873444, + "kl": 0.4093516394495964, + "learning_rate": 1.5688388926080534e-06, + "loss": 0.1201, + "reward": 1.823958396911621, + "reward_std": 0.24740922898054124, + "rewards/accuracy_reward": 0.15208333786576986, + "rewards/format_reward": 0.9395833492279053, + "rewards/tag_count_reward": 0.7322916805744171, + "step": 2616 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.7166870117187, + "epoch": 0.8375740118418947, + "grad_norm": 0.3734375536441803, + "kl": 0.2914412751793861, + "learning_rate": 1.5628344350155477e-06, + "loss": 0.1217, + "reward": 1.7296875238418579, + "reward_std": 0.23707970902323722, + "rewards/accuracy_reward": 0.03958333358168602, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2617 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.3666900634765, + "epoch": 0.8378940630500881, + "grad_norm": 0.25126445293426514, + "kl": 0.26317610368132593, + "learning_rate": 1.5568405158206523e-06, + "loss": 0.0849, + "reward": 1.7848958730697633, + "reward_std": 0.18527232334017754, + "rewards/accuracy_reward": 0.08958333637565374, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2618 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.4645935058594, + "epoch": 0.8382141142582813, + "grad_norm": 0.3053106367588043, + "kl": 0.44118655622005465, + "learning_rate": 1.5508571425100428e-06, + "loss": 0.0941, + "reward": 1.7338541984558105, + "reward_std": 0.26469497084617616, + "rewards/accuracy_reward": 0.05625000149011612, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2619 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.1729370117188, + "epoch": 0.8385341654664746, + "grad_norm": 0.22060352563858032, + "kl": 0.2865352720022202, + "learning_rate": 1.5448843225572218e-06, + "loss": 0.0707, + "reward": 1.809375035762787, + "reward_std": 0.16750450432300568, + "rewards/accuracy_reward": 0.09791667014360428, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7427083432674408, + "step": 2620 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.0520935058594, + "epoch": 0.838854216674668, + "grad_norm": 0.33194923400878906, + "kl": 0.26609497480094435, + "learning_rate": 1.5389220634225077e-06, + "loss": 0.057, + "reward": 1.834375023841858, + "reward_std": 0.25171951204538345, + "rewards/accuracy_reward": 0.12708334028720855, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.740625011920929, + "step": 2621 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.0687774658203, + "epoch": 0.8391742678828612, + "grad_norm": 0.1432190239429474, + "kl": 0.20767300575971603, + "learning_rate": 1.5329703725530298e-06, + "loss": 0.0666, + "reward": 1.840625035762787, + "reward_std": 0.20196708291769028, + "rewards/accuracy_reward": 0.13750000465661288, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 2622 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.5916870117187, + "epoch": 0.8394943190910545, + "grad_norm": 0.2663310468196869, + "kl": 0.5039402447640896, + "learning_rate": 1.5270292573827173e-06, + "loss": 0.1465, + "reward": 1.7687500596046448, + "reward_std": 0.2818905636668205, + "rewards/accuracy_reward": 0.09791666939854622, + "rewards/format_reward": 0.9354166924953461, + "rewards/tag_count_reward": 0.735416692495346, + "step": 2623 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.5104431152344, + "epoch": 0.8398143702992479, + "grad_norm": 0.14252537488937378, + "kl": 0.2896796494722366, + "learning_rate": 1.5210987253322862e-06, + "loss": 0.0766, + "reward": 1.7791666865348816, + "reward_std": 0.19611259773373604, + "rewards/accuracy_reward": 0.07500000111758709, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2624 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.9958465576171, + "epoch": 0.8401344215074412, + "grad_norm": 0.11551255732774734, + "kl": 0.38152455165982246, + "learning_rate": 1.5151787838092425e-06, + "loss": 0.129, + "reward": 1.7812500238418578, + "reward_std": 0.21832403987646104, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.9416666746139526, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2625 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.7604370117188, + "epoch": 0.8404544727156344, + "grad_norm": 0.22384434938430786, + "kl": 0.23071169778704642, + "learning_rate": 1.509269440207851e-06, + "loss": 0.0816, + "reward": 1.7817708849906921, + "reward_std": 0.23716015443205835, + "rewards/accuracy_reward": 0.08541667014360428, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2626 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.8812683105468, + "epoch": 0.8407745239238278, + "grad_norm": 0.1428438276052475, + "kl": 0.3534207258373499, + "learning_rate": 1.5033707019091503e-06, + "loss": 0.0928, + "reward": 1.7427083730697632, + "reward_std": 0.259302493929863, + "rewards/accuracy_reward": 0.06458333488553762, + "rewards/format_reward": 0.9416666805744172, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2627 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.1895965576172, + "epoch": 0.8410945751320211, + "grad_norm": 0.17479349672794342, + "kl": 0.30650137886404993, + "learning_rate": 1.4974825762809275e-06, + "loss": 0.1127, + "reward": 1.81927090883255, + "reward_std": 0.22232020273804665, + "rewards/accuracy_reward": 0.12083333749324084, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2628 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.5375274658203, + "epoch": 0.8414146263402145, + "grad_norm": 0.14325033128261566, + "kl": 0.27730308175086976, + "learning_rate": 1.4916050706777185e-06, + "loss": 0.1261, + "reward": 1.8218750596046447, + "reward_std": 0.2709692373871803, + "rewards/accuracy_reward": 0.13333333637565375, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2629 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.1521057128906, + "epoch": 0.8417346775484077, + "grad_norm": 0.20191915333271027, + "kl": 0.31308504939079285, + "learning_rate": 1.4857381924407833e-06, + "loss": 0.0824, + "reward": 1.7651041984558105, + "reward_std": 0.18237927034497262, + "rewards/accuracy_reward": 0.0666666679084301, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2630 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.4229339599609, + "epoch": 0.842054728756601, + "grad_norm": 0.30419307947158813, + "kl": 0.3622270733118057, + "learning_rate": 1.4798819488981232e-06, + "loss": 0.1275, + "reward": 1.767708384990692, + "reward_std": 0.24212229549884795, + "rewards/accuracy_reward": 0.07500000335276127, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 2631 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.1583557128906, + "epoch": 0.8423747799647944, + "grad_norm": 0.13989323377609253, + "kl": 0.31397813037037847, + "learning_rate": 1.474036347364446e-06, + "loss": 0.1038, + "reward": 1.6932291865348816, + "reward_std": 0.18132396936416625, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.736979192495346, + "step": 2632 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.1500213623046, + "epoch": 0.8426948311729877, + "grad_norm": 0.20425206422805786, + "kl": 0.49030707627534864, + "learning_rate": 1.4682013951411723e-06, + "loss": 0.1228, + "reward": 1.751562523841858, + "reward_std": 0.2534876331686974, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7328125178813935, + "step": 2633 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.2437622070313, + "epoch": 0.8430148823811809, + "grad_norm": 0.299966961145401, + "kl": 0.4217537730932236, + "learning_rate": 1.4623770995164133e-06, + "loss": 0.0986, + "reward": 1.8588541984558105, + "reward_std": 0.25987871587276457, + "rewards/accuracy_reward": 0.18125000670552255, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7317708492279053, + "step": 2634 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.0125335693359, + "epoch": 0.8433349335893743, + "grad_norm": 0.1416076272726059, + "kl": 0.24743956252932547, + "learning_rate": 1.4565634677649786e-06, + "loss": 0.0984, + "reward": 1.778645884990692, + "reward_std": 0.2074045091867447, + "rewards/accuracy_reward": 0.07708333488553762, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 2635 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.6229309082031, + "epoch": 0.8436549847975676, + "grad_norm": 0.25936782360076904, + "kl": 0.44153971374034884, + "learning_rate": 1.4507605071483533e-06, + "loss": 0.0864, + "reward": 1.8010417342185974, + "reward_std": 0.22055020928382874, + "rewards/accuracy_reward": 0.10833333637565375, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2636 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.3500213623047, + "epoch": 0.843975036005761, + "grad_norm": 0.3049626350402832, + "kl": 0.28074306845664976, + "learning_rate": 1.4449682249146957e-06, + "loss": 0.1032, + "reward": 1.8630208730697633, + "reward_std": 0.2228449009358883, + "rewards/accuracy_reward": 0.1708333395421505, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7359375119209289, + "step": 2637 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.5000274658203, + "epoch": 0.8442950872139542, + "grad_norm": 0.49720048904418945, + "kl": 0.5407494202256202, + "learning_rate": 1.4391866282988266e-06, + "loss": 0.0996, + "reward": 1.7453125238418579, + "reward_std": 0.23839708790183067, + "rewards/accuracy_reward": 0.07291666921228171, + "rewards/format_reward": 0.9416666805744172, + "rewards/tag_count_reward": 0.7307291805744172, + "step": 2638 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.0021057128906, + "epoch": 0.8446151384221475, + "grad_norm": 0.15758629143238068, + "kl": 0.3089794680476189, + "learning_rate": 1.433415724522218e-06, + "loss": 0.0945, + "reward": 1.8218750357627869, + "reward_std": 0.2678198732435703, + "rewards/accuracy_reward": 0.1270833369344473, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2639 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.2416778564453, + "epoch": 0.8449351896303409, + "grad_norm": 0.20118384063243866, + "kl": 0.5578947067260742, + "learning_rate": 1.4276555207929864e-06, + "loss": 0.1285, + "reward": 1.7604166865348816, + "reward_std": 0.2577348858118057, + "rewards/accuracy_reward": 0.08333333432674409, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.729166692495346, + "step": 2640 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.5604309082031, + "epoch": 0.8452552408385342, + "grad_norm": 0.23886068165302277, + "kl": 0.4678042992949486, + "learning_rate": 1.4219060243058879e-06, + "loss": 0.1033, + "reward": 1.8104166984558105, + "reward_std": 0.20677258148789407, + "rewards/accuracy_reward": 0.10416667014360428, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2641 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.9916870117188, + "epoch": 0.8455752920467274, + "grad_norm": 0.17162956297397614, + "kl": 0.31468153595924375, + "learning_rate": 1.4161672422422968e-06, + "loss": 0.0841, + "reward": 1.7854166865348815, + "reward_std": 0.15550397485494613, + "rewards/accuracy_reward": 0.07708333544433117, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2642 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.9687591552735, + "epoch": 0.8458953432549208, + "grad_norm": 0.37330108880996704, + "kl": 0.3181822635233402, + "learning_rate": 1.410439181770209e-06, + "loss": 0.0937, + "reward": 1.880208384990692, + "reward_std": 0.23388459980487825, + "rewards/accuracy_reward": 0.18541667312383653, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7364583611488342, + "step": 2643 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.7812591552735, + "epoch": 0.8462153944631141, + "grad_norm": 0.2929092347621918, + "kl": 0.3507498770952225, + "learning_rate": 1.4047218500442305e-06, + "loss": 0.0923, + "reward": 1.7640625357627868, + "reward_std": 0.22633379325270653, + "rewards/accuracy_reward": 0.07916666977107525, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2644 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.1791809082031, + "epoch": 0.8465354456713075, + "grad_norm": 0.19493304193019867, + "kl": 0.2631402283906937, + "learning_rate": 1.3990152542055647e-06, + "loss": 0.0892, + "reward": 1.692187535762787, + "reward_std": 0.20709386914968492, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.9500000298023223, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2645 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.1312652587891, + "epoch": 0.8468554968795007, + "grad_norm": 0.1697675734758377, + "kl": 0.3390098616480827, + "learning_rate": 1.3933194013820038e-06, + "loss": 0.0811, + "reward": 1.8270833969116211, + "reward_std": 0.21904802471399307, + "rewards/accuracy_reward": 0.1354166718199849, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7437500119209289, + "step": 2646 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.6916870117187, + "epoch": 0.847175548087694, + "grad_norm": 0.18587954342365265, + "kl": 0.34423902481794355, + "learning_rate": 1.3876342986879243e-06, + "loss": 0.0793, + "reward": 1.7244791984558105, + "reward_std": 0.23162373006343842, + "rewards/accuracy_reward": 0.03541666828095913, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2647 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.3125244140625, + "epoch": 0.8474955992958874, + "grad_norm": 0.22119919955730438, + "kl": 0.3237848818302155, + "learning_rate": 1.3819599532242733e-06, + "loss": 0.0992, + "reward": 1.7864583611488343, + "reward_std": 0.2305359125137329, + "rewards/accuracy_reward": 0.09166666883975268, + "rewards/format_reward": 0.9562500298023224, + "rewards/tag_count_reward": 0.7385416924953461, + "step": 2648 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.4437683105468, + "epoch": 0.8478156505040807, + "grad_norm": 0.15753722190856934, + "kl": 0.36308416426181794, + "learning_rate": 1.3762963720785638e-06, + "loss": 0.0961, + "reward": 1.8088542103767395, + "reward_std": 0.2562564447522163, + "rewards/accuracy_reward": 0.10208333674818278, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7421875119209289, + "step": 2649 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.2937591552734, + "epoch": 0.8481357017122739, + "grad_norm": 0.2265545278787613, + "kl": 0.4071022719144821, + "learning_rate": 1.3706435623248627e-06, + "loss": 0.1261, + "reward": 1.817187535762787, + "reward_std": 0.2956122875213623, + "rewards/accuracy_reward": 0.1479166718199849, + "rewards/format_reward": 0.9354166805744171, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2650 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.441683959961, + "epoch": 0.8484557529204673, + "grad_norm": 0.2043626606464386, + "kl": 0.3923542708158493, + "learning_rate": 1.3650015310237796e-06, + "loss": 0.1057, + "reward": 1.741145873069763, + "reward_std": 0.19232798367738724, + "rewards/accuracy_reward": 0.043750002048909664, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2651 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.8833648681641, + "epoch": 0.8487758041286606, + "grad_norm": 0.17744043469429016, + "kl": 0.2601847030222416, + "learning_rate": 1.3593702852224655e-06, + "loss": 0.0939, + "reward": 1.8307292222976685, + "reward_std": 0.18132452741265298, + "rewards/accuracy_reward": 0.12083333730697632, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2652 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.8666809082031, + "epoch": 0.8490958553368539, + "grad_norm": 0.1844325214624405, + "kl": 0.2797363385558128, + "learning_rate": 1.3537498319545984e-06, + "loss": 0.0685, + "reward": 1.8364583730697632, + "reward_std": 0.2018692083656788, + "rewards/accuracy_reward": 0.13125000279396773, + "rewards/format_reward": 0.9666666746139526, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2653 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.9500152587891, + "epoch": 0.8494159065450472, + "grad_norm": 0.2170618176460266, + "kl": 0.23281632959842682, + "learning_rate": 1.3481401782403792e-06, + "loss": 0.0911, + "reward": 1.7765625357627868, + "reward_std": 0.1638067312538624, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2654 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.6770935058594, + "epoch": 0.8497359577532405, + "grad_norm": 0.1635342389345169, + "kl": 0.46017926633358003, + "learning_rate": 1.3425413310865087e-06, + "loss": 0.1198, + "reward": 1.8062500357627869, + "reward_std": 0.22490473836660385, + "rewards/accuracy_reward": 0.11250000409781932, + "rewards/format_reward": 0.9583333432674408, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2655 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.7958526611328, + "epoch": 0.8500560089614339, + "grad_norm": 0.1633990854024887, + "kl": 0.3503039345145226, + "learning_rate": 1.3369532974862053e-06, + "loss": 0.0993, + "reward": 1.7932292103767395, + "reward_std": 0.2544810831546783, + "rewards/accuracy_reward": 0.11041666828095913, + "rewards/format_reward": 0.9437500238418579, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 2656 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.1583557128906, + "epoch": 0.8503760601696272, + "grad_norm": 0.15160250663757324, + "kl": 0.20180488303303717, + "learning_rate": 1.3313760844191713e-06, + "loss": 0.0956, + "reward": 1.7343750238418578, + "reward_std": 0.20577147156000136, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2657 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.3541931152344, + "epoch": 0.8506961113778204, + "grad_norm": 0.12937583029270172, + "kl": 0.546181932091713, + "learning_rate": 1.325809698851598e-06, + "loss": 0.0912, + "reward": 1.7593750357627869, + "reward_std": 0.25953795313835143, + "rewards/accuracy_reward": 0.08125000260770321, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7322916984558105, + "step": 2658 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.1250152587891, + "epoch": 0.8510161625860138, + "grad_norm": 0.13249269127845764, + "kl": 0.2853389322757721, + "learning_rate": 1.3202541477361441e-06, + "loss": 0.0726, + "reward": 1.8062500715255738, + "reward_std": 0.1834932841360569, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2659 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.6979400634766, + "epoch": 0.8513362137942071, + "grad_norm": 0.16687384247779846, + "kl": 0.3180491279810667, + "learning_rate": 1.314709438011945e-06, + "loss": 0.1088, + "reward": 1.7786458611488343, + "reward_std": 0.24222037941217422, + "rewards/accuracy_reward": 0.09791667088866234, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2660 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.1083557128907, + "epoch": 0.8516562650024004, + "grad_norm": 0.14712361991405487, + "kl": 0.30614238381385805, + "learning_rate": 1.3091755766045922e-06, + "loss": 0.0958, + "reward": 1.9145833611488343, + "reward_std": 0.21078643649816514, + "rewards/accuracy_reward": 0.21458333935588597, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2661 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.0333526611328, + "epoch": 0.8519763162105937, + "grad_norm": 0.11870875954627991, + "kl": 0.3040292389690876, + "learning_rate": 1.303652570426125e-06, + "loss": 0.0715, + "reward": 1.7276041865348817, + "reward_std": 0.1599747955799103, + "rewards/accuracy_reward": 0.0125, + "rewards/format_reward": 0.9750000238418579, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2662 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.4250213623047, + "epoch": 0.852296367418787, + "grad_norm": 0.3058604896068573, + "kl": 0.3141802024096251, + "learning_rate": 1.2981404263750264e-06, + "loss": 0.078, + "reward": 1.7994792103767394, + "reward_std": 0.20162120461463928, + "rewards/accuracy_reward": 0.09375000279396772, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2663 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.1250213623047, + "epoch": 0.8526164186269803, + "grad_norm": 0.18708136677742004, + "kl": 0.359296178817749, + "learning_rate": 1.2926391513362102e-06, + "loss": 0.1079, + "reward": 1.7489583611488342, + "reward_std": 0.24096645563840866, + "rewards/accuracy_reward": 0.052083334885537624, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2664 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.0354309082031, + "epoch": 0.8529364698351737, + "grad_norm": 0.22880977392196655, + "kl": 0.4187725305557251, + "learning_rate": 1.2871487521810166e-06, + "loss": 0.1185, + "reward": 1.8052083730697632, + "reward_std": 0.28198549449443816, + "rewards/accuracy_reward": 0.12708333767950536, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2665 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.4416809082031, + "epoch": 0.8532565210433669, + "grad_norm": 0.1381569802761078, + "kl": 0.28381815254688264, + "learning_rate": 1.2816692357672012e-06, + "loss": 0.0855, + "reward": 1.794270884990692, + "reward_std": 0.21319209337234496, + "rewards/accuracy_reward": 0.10833333861082792, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7359375119209289, + "step": 2666 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.3875244140625, + "epoch": 0.8535765722515603, + "grad_norm": 0.15339335799217224, + "kl": 0.3478463143110275, + "learning_rate": 1.2762006089389212e-06, + "loss": 0.0999, + "reward": 1.7197917103767395, + "reward_std": 0.22356971949338914, + "rewards/accuracy_reward": 0.02708333358168602, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2667 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.0375183105468, + "epoch": 0.8538966234597536, + "grad_norm": 0.27163901925086975, + "kl": 0.42675180844962596, + "learning_rate": 1.2707428785267396e-06, + "loss": 0.0928, + "reward": 1.8291667342185973, + "reward_std": 0.20625257939100267, + "rewards/accuracy_reward": 0.1416666701436043, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2668 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.3541870117188, + "epoch": 0.8542166746679468, + "grad_norm": 0.25037550926208496, + "kl": 0.23697092048823834, + "learning_rate": 1.2652960513476043e-06, + "loss": 0.0653, + "reward": 1.7333333492279053, + "reward_std": 0.13338673561811448, + "rewards/accuracy_reward": 0.01666666679084301, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2669 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.9979370117187, + "epoch": 0.8545367258761402, + "grad_norm": 0.23881444334983826, + "kl": 0.42256051301956177, + "learning_rate": 1.2598601342048477e-06, + "loss": 0.1146, + "reward": 1.8848958611488342, + "reward_std": 0.29029052406549455, + "rewards/accuracy_reward": 0.19375000428408384, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2670 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.243765258789, + "epoch": 0.8548567770843335, + "grad_norm": 0.09502812474966049, + "kl": 0.18763545230031015, + "learning_rate": 1.2544351338881721e-06, + "loss": 0.0846, + "reward": 1.7942708492279054, + "reward_std": 0.15935472398996353, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2671 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.3312622070313, + "epoch": 0.8551768282925268, + "grad_norm": 0.17005877196788788, + "kl": 0.4221621580421925, + "learning_rate": 1.2490210571736484e-06, + "loss": 0.1006, + "reward": 1.7765625357627868, + "reward_std": 0.22101174741983415, + "rewards/accuracy_reward": 0.08541666697710752, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7348958492279053, + "step": 2672 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.2625183105469, + "epoch": 0.8554968795007201, + "grad_norm": 0.241080179810524, + "kl": 0.3817593351006508, + "learning_rate": 1.2436179108236989e-06, + "loss": 0.0917, + "reward": 1.775000023841858, + "reward_std": 0.2237080730497837, + "rewards/accuracy_reward": 0.08750000167638064, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2673 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.0166870117188, + "epoch": 0.8558169307089134, + "grad_norm": 0.17033007740974426, + "kl": 0.3994890958070755, + "learning_rate": 1.2382257015870957e-06, + "loss": 0.1237, + "reward": 1.746875035762787, + "reward_std": 0.24429445043206216, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7322916865348816, + "step": 2674 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.133349609375, + "epoch": 0.8561369819171067, + "grad_norm": 0.11920144408941269, + "kl": 0.25182824283838273, + "learning_rate": 1.2328444361989523e-06, + "loss": 0.07, + "reward": 1.7604166865348816, + "reward_std": 0.18208148702979088, + "rewards/accuracy_reward": 0.05416666697710752, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2675 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.2958465576172, + "epoch": 0.8564570331253001, + "grad_norm": 0.1800604909658432, + "kl": 0.3815006874501705, + "learning_rate": 1.227474121380705e-06, + "loss": 0.104, + "reward": 1.7385417103767395, + "reward_std": 0.23486847579479217, + "rewards/accuracy_reward": 0.0520833358168602, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2676 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.7083526611328, + "epoch": 0.8567770843334933, + "grad_norm": 0.19448384642601013, + "kl": 0.38638448938727377, + "learning_rate": 1.222114763840121e-06, + "loss": 0.1007, + "reward": 1.8333333730697632, + "reward_std": 0.19629458263516425, + "rewards/accuracy_reward": 0.1395833384245634, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.735416692495346, + "step": 2677 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.2854370117187, + "epoch": 0.8570971355416866, + "grad_norm": 0.15441475808620453, + "kl": 0.32658002004027364, + "learning_rate": 1.2167663702712773e-06, + "loss": 0.1138, + "reward": 1.7317708730697632, + "reward_std": 0.18842825740575792, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2678 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.039599609375, + "epoch": 0.85741718674988, + "grad_norm": 0.19493429362773895, + "kl": 0.5683829590678215, + "learning_rate": 1.2114289473545583e-06, + "loss": 0.1266, + "reward": 1.7197916865348817, + "reward_std": 0.2377609834074974, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.9520833611488342, + "rewards/tag_count_reward": 0.7281250178813934, + "step": 2679 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.0000183105469, + "epoch": 0.8577372379580733, + "grad_norm": 0.13462968170642853, + "kl": 0.22133766189217569, + "learning_rate": 1.2061025017566374e-06, + "loss": 0.051, + "reward": 1.7822917103767395, + "reward_std": 0.16446125581860543, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.975000011920929, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 2680 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.8562622070312, + "epoch": 0.8580572891662666, + "grad_norm": 0.2179809808731079, + "kl": 0.30674128159880637, + "learning_rate": 1.2007870401304922e-06, + "loss": 0.0736, + "reward": 1.8442708849906921, + "reward_std": 0.2419275127351284, + "rewards/accuracy_reward": 0.1500000072643161, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2681 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.5625183105469, + "epoch": 0.8583773403744599, + "grad_norm": 0.18204998970031738, + "kl": 0.29479278065264225, + "learning_rate": 1.1954825691153682e-06, + "loss": 0.0899, + "reward": 1.7348958611488343, + "reward_std": 0.15623627305030824, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7432291805744171, + "step": 2682 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.5187744140625, + "epoch": 0.8586973915826532, + "grad_norm": 0.18926359713077545, + "kl": 0.34724462777376175, + "learning_rate": 1.190189095336791e-06, + "loss": 0.1008, + "reward": 1.8578125596046449, + "reward_std": 0.2738295793533325, + "rewards/accuracy_reward": 0.1708333373069763, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7328125178813935, + "step": 2683 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.8229309082031, + "epoch": 0.8590174427908466, + "grad_norm": 0.09490280598402023, + "kl": 0.1771955456584692, + "learning_rate": 1.1849066254065412e-06, + "loss": 0.052, + "reward": 1.7838541984558105, + "reward_std": 0.1423393502831459, + "rewards/accuracy_reward": 0.06250000130385161, + "rewards/format_reward": 0.9791666805744171, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2684 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.9875122070313, + "epoch": 0.8593374939990398, + "grad_norm": 0.18486838042736053, + "kl": 0.38636996001005175, + "learning_rate": 1.1796351659226623e-06, + "loss": 0.1157, + "reward": 1.8244792222976685, + "reward_std": 0.2464168481528759, + "rewards/accuracy_reward": 0.13541667107492686, + "rewards/format_reward": 0.9541666746139527, + "rewards/tag_count_reward": 0.7348958492279053, + "step": 2685 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.1375183105469, + "epoch": 0.8596575452072331, + "grad_norm": 0.1398511826992035, + "kl": 0.3162212152034044, + "learning_rate": 1.1743747234694437e-06, + "loss": 0.0875, + "reward": 1.8901041865348815, + "reward_std": 0.18897883892059325, + "rewards/accuracy_reward": 0.18750000298023223, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2686 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.7896057128906, + "epoch": 0.8599775964154265, + "grad_norm": 0.23524326086044312, + "kl": 0.34184712544083595, + "learning_rate": 1.1691253046174144e-06, + "loss": 0.0877, + "reward": 1.7583333849906921, + "reward_std": 0.19846115112304688, + "rewards/accuracy_reward": 0.05833333544433117, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2687 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.3125244140625, + "epoch": 0.8602976476236198, + "grad_norm": 0.3231591284275055, + "kl": 0.20597450919449328, + "learning_rate": 1.1638869159233301e-06, + "loss": 0.0747, + "reward": 1.7786458611488343, + "reward_std": 0.1821716882288456, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.9708333432674408, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2688 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.2187683105469, + "epoch": 0.860617698831813, + "grad_norm": 0.17155896127223969, + "kl": 0.30113087967038155, + "learning_rate": 1.1586595639301768e-06, + "loss": 0.06, + "reward": 1.7901041746139525, + "reward_std": 0.2241733819246292, + "rewards/accuracy_reward": 0.08541666772216558, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2689 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.2312713623047, + "epoch": 0.8609377500400064, + "grad_norm": 0.20491039752960205, + "kl": 0.4510466232895851, + "learning_rate": 1.1534432551671492e-06, + "loss": 0.0927, + "reward": 1.7369791865348816, + "reward_std": 0.21386837363243102, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7348958432674408, + "step": 2690 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.8146118164062, + "epoch": 0.8612578012481997, + "grad_norm": 0.09562389552593231, + "kl": 0.27018130123615264, + "learning_rate": 1.1482379961496536e-06, + "loss": 0.0838, + "reward": 1.7692708730697633, + "reward_std": 0.20710121542215348, + "rewards/accuracy_reward": 0.07291667070239782, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2691 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.3854400634766, + "epoch": 0.8615778524563931, + "grad_norm": 0.28722503781318665, + "kl": 0.22337874136865138, + "learning_rate": 1.143043793379287e-06, + "loss": 0.082, + "reward": 1.789583396911621, + "reward_std": 0.19062369912862778, + "rewards/accuracy_reward": 0.08541666883975267, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.743750023841858, + "step": 2692 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.3979339599609, + "epoch": 0.8618979036645863, + "grad_norm": 0.17179274559020996, + "kl": 0.27019251585006715, + "learning_rate": 1.1378606533438442e-06, + "loss": 0.073, + "reward": 1.7218750238418579, + "reward_std": 0.22592300027608872, + "rewards/accuracy_reward": 0.0354166679084301, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7322916924953461, + "step": 2693 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.6229370117187, + "epoch": 0.8622179548727796, + "grad_norm": 0.16243615746498108, + "kl": 0.25600522831082345, + "learning_rate": 1.1326885825172973e-06, + "loss": 0.0854, + "reward": 1.8052083730697632, + "reward_std": 0.20433509722352028, + "rewards/accuracy_reward": 0.10625000223517418, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2694 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.5146057128907, + "epoch": 0.862538006080973, + "grad_norm": 0.2923242449760437, + "kl": 0.3979221750050783, + "learning_rate": 1.1275275873597957e-06, + "loss": 0.1161, + "reward": 1.7541667103767395, + "reward_std": 0.21354512870311737, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2695 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.702099609375, + "epoch": 0.8628580572891663, + "grad_norm": 0.16917048394680023, + "kl": 0.3872626259922981, + "learning_rate": 1.122377674317653e-06, + "loss": 0.115, + "reward": 1.7744791865348817, + "reward_std": 0.21435921788215637, + "rewards/accuracy_reward": 0.08958333618938923, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7307291805744172, + "step": 2696 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.083349609375, + "epoch": 0.8631781084973595, + "grad_norm": 0.18929900228977203, + "kl": 0.33370514437556265, + "learning_rate": 1.1172388498233421e-06, + "loss": 0.0709, + "reward": 1.7781250119209289, + "reward_std": 0.1962820142507553, + "rewards/accuracy_reward": 0.07708333488553762, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2697 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.308349609375, + "epoch": 0.8634981597055529, + "grad_norm": 0.09958707541227341, + "kl": 0.24256644695997237, + "learning_rate": 1.1121111202954836e-06, + "loss": 0.0797, + "reward": 1.7947916984558105, + "reward_std": 0.21337411254644395, + "rewards/accuracy_reward": 0.0958333345130086, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2698 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.0083557128906, + "epoch": 0.8638182109137462, + "grad_norm": 0.21212667226791382, + "kl": 0.4391674891114235, + "learning_rate": 1.1069944921388442e-06, + "loss": 0.0906, + "reward": 1.7661458849906921, + "reward_std": 0.22532435059547423, + "rewards/accuracy_reward": 0.07083333488553763, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7328125178813935, + "step": 2699 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.220849609375, + "epoch": 0.8641382621219396, + "grad_norm": 0.2691134214401245, + "kl": 0.30899880900979043, + "learning_rate": 1.1018889717443182e-06, + "loss": 0.0998, + "reward": 1.7416666865348815, + "reward_std": 0.23400208950042725, + "rewards/accuracy_reward": 0.04791666772216559, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2700 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.5500244140625, + "epoch": 0.8644583133301328, + "grad_norm": 0.20773187279701233, + "kl": 0.4713391542434692, + "learning_rate": 1.096794565488929e-06, + "loss": 0.1375, + "reward": 1.7796875476837157, + "reward_std": 0.306690426170826, + "rewards/accuracy_reward": 0.10208333861082793, + "rewards/format_reward": 0.9479166805744171, + "rewards/tag_count_reward": 0.7296875178813934, + "step": 2701 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.7541870117187, + "epoch": 0.8647783645383261, + "grad_norm": 0.30383846163749695, + "kl": 0.27798029854893685, + "learning_rate": 1.0917112797358199e-06, + "loss": 0.0984, + "reward": 1.8197917222976685, + "reward_std": 0.2308654323220253, + "rewards/accuracy_reward": 0.12708333637565375, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2702 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.6500183105469, + "epoch": 0.8650984157465195, + "grad_norm": 0.12061861157417297, + "kl": 0.23864154443144797, + "learning_rate": 1.086639120834243e-06, + "loss": 0.0941, + "reward": 1.7442708730697631, + "reward_std": 0.1674013689160347, + "rewards/accuracy_reward": 0.03958333432674408, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2703 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.1708557128907, + "epoch": 0.8654184669547128, + "grad_norm": 0.21024452149868011, + "kl": 0.41636669635772705, + "learning_rate": 1.0815780951195521e-06, + "loss": 0.1121, + "reward": 1.7630208730697632, + "reward_std": 0.2516354911029339, + "rewards/accuracy_reward": 0.09166666977107525, + "rewards/format_reward": 0.9395833551883698, + "rewards/tag_count_reward": 0.7317708492279053, + "step": 2704 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.3541809082031, + "epoch": 0.865738518162906, + "grad_norm": 0.27876242995262146, + "kl": 0.3096645545214415, + "learning_rate": 1.076528208913189e-06, + "loss": 0.1002, + "reward": 1.8083333611488341, + "reward_std": 0.20803507119417192, + "rewards/accuracy_reward": 0.10625000149011612, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2705 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.6625122070312, + "epoch": 0.8660585693710994, + "grad_norm": 0.19789564609527588, + "kl": 0.2858868185430765, + "learning_rate": 1.0714894685226961e-06, + "loss": 0.1107, + "reward": 1.7984375476837158, + "reward_std": 0.23042803555727004, + "rewards/accuracy_reward": 0.10000000260770321, + "rewards/format_reward": 0.9583333432674408, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2706 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.8604309082032, + "epoch": 0.8663786205792927, + "grad_norm": 0.12375348061323166, + "kl": 0.3638369083404541, + "learning_rate": 1.0664618802416814e-06, + "loss": 0.138, + "reward": 1.7260416984558105, + "reward_std": 0.24393313825130464, + "rewards/accuracy_reward": 0.05000000204890966, + "rewards/format_reward": 0.9395833492279053, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2707 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.0250183105469, + "epoch": 0.866698671787486, + "grad_norm": 0.18104706704616547, + "kl": 0.30177239924669264, + "learning_rate": 1.0614454503498306e-06, + "loss": 0.0829, + "reward": 1.7479166865348816, + "reward_std": 0.1778070405125618, + "rewards/accuracy_reward": 0.04166666772216558, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2708 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.0500091552734, + "epoch": 0.8670187229956793, + "grad_norm": 0.15156742930412292, + "kl": 0.36322931200265884, + "learning_rate": 1.0564401851128846e-06, + "loss": 0.1007, + "reward": 1.7916667222976685, + "reward_std": 0.22478875443339347, + "rewards/accuracy_reward": 0.09583333600312471, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.737500011920929, + "step": 2709 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.9916809082031, + "epoch": 0.8673387742038726, + "grad_norm": 0.24718128144741058, + "kl": 0.3891347452998161, + "learning_rate": 1.0514460907826473e-06, + "loss": 0.0986, + "reward": 1.8010417103767395, + "reward_std": 0.23830287754535676, + "rewards/accuracy_reward": 0.10416666902601719, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7385416924953461, + "step": 2710 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.3625244140625, + "epoch": 0.867658825412066, + "grad_norm": 0.2284335196018219, + "kl": 0.367191506177187, + "learning_rate": 1.0464631735969655e-06, + "loss": 0.1262, + "reward": 1.7213541984558105, + "reward_std": 0.26351067125797273, + "rewards/accuracy_reward": 0.03750000074505806, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2711 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.2896057128906, + "epoch": 0.8679788766202592, + "grad_norm": 0.132751002907753, + "kl": 0.20918865650892257, + "learning_rate": 1.0414914397797271e-06, + "loss": 0.063, + "reward": 1.8729167222976684, + "reward_std": 0.21915601789951325, + "rewards/accuracy_reward": 0.16250000651925803, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7458333551883698, + "step": 2712 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.395849609375, + "epoch": 0.8682989278284525, + "grad_norm": 0.12675440311431885, + "kl": 0.3191126808524132, + "learning_rate": 1.0365308955408459e-06, + "loss": 0.1096, + "reward": 1.7890625476837159, + "reward_std": 0.24897948130965233, + "rewards/accuracy_reward": 0.10000000409781933, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2713 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.3437713623047, + "epoch": 0.8686189790366459, + "grad_norm": 0.13551534712314606, + "kl": 0.24030449092388154, + "learning_rate": 1.031581547076268e-06, + "loss": 0.098, + "reward": 1.746875035762787, + "reward_std": 0.16820005923509598, + "rewards/accuracy_reward": 0.04791666865348816, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2714 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.8666809082031, + "epoch": 0.8689390302448392, + "grad_norm": 0.39579689502716064, + "kl": 0.38164796978235244, + "learning_rate": 1.0266434005679503e-06, + "loss": 0.0777, + "reward": 1.7369792103767394, + "reward_std": 0.16624386757612228, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7369791805744171, + "step": 2715 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.0937683105469, + "epoch": 0.8692590814530324, + "grad_norm": 0.21584556996822357, + "kl": 0.36943108662962915, + "learning_rate": 1.0217164621838605e-06, + "loss": 0.1115, + "reward": 1.833333396911621, + "reward_std": 0.24086197540163995, + "rewards/accuracy_reward": 0.13541667070239782, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2716 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.5812744140625, + "epoch": 0.8695791326612258, + "grad_norm": 0.21526369452476501, + "kl": 0.30623162984848024, + "learning_rate": 1.016800738077962e-06, + "loss": 0.0955, + "reward": 1.6947916984558105, + "reward_std": 0.20032616332173347, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2717 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.6250244140625, + "epoch": 0.8698991838694191, + "grad_norm": 0.1324380338191986, + "kl": 0.5877346590161323, + "learning_rate": 1.011896234390215e-06, + "loss": 0.1509, + "reward": 1.7822916984558106, + "reward_std": 0.2619222469627857, + "rewards/accuracy_reward": 0.09791666679084302, + "rewards/format_reward": 0.9520833432674408, + "rewards/tag_count_reward": 0.7322916805744171, + "step": 2718 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.3979431152344, + "epoch": 0.8702192350776125, + "grad_norm": 0.13195985555648804, + "kl": 0.18522481769323348, + "learning_rate": 1.0070029572465657e-06, + "loss": 0.0962, + "reward": 1.8500000715255738, + "reward_std": 0.18061546236276627, + "rewards/accuracy_reward": 0.14375000428408385, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2719 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.8250183105469, + "epoch": 0.8705392862858057, + "grad_norm": 0.17610874772071838, + "kl": 0.3283210381865501, + "learning_rate": 1.002120912758935e-06, + "loss": 0.1201, + "reward": 1.725520873069763, + "reward_std": 0.26378956735134124, + "rewards/accuracy_reward": 0.04375000149011612, + "rewards/format_reward": 0.9416666924953461, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2720 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.0896057128906, + "epoch": 0.870859337493999, + "grad_norm": 0.1175459697842598, + "kl": 0.3252341076731682, + "learning_rate": 9.97250107025216e-07, + "loss": 0.0661, + "reward": 1.8026041984558105, + "reward_std": 0.1510702796280384, + "rewards/accuracy_reward": 0.0895833358168602, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2721 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.7833587646485, + "epoch": 0.8711793887021924, + "grad_norm": 0.08211004734039307, + "kl": 0.2834597870707512, + "learning_rate": 9.923905461292638e-07, + "loss": 0.0908, + "reward": 1.8187500357627868, + "reward_std": 0.1927901290357113, + "rewards/accuracy_reward": 0.11041667070239783, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.743750023841858, + "step": 2722 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.2708557128906, + "epoch": 0.8714994399103857, + "grad_norm": 0.11005070060491562, + "kl": 0.3537232682108879, + "learning_rate": 9.87542236140886e-07, + "loss": 0.0729, + "reward": 1.7796875476837157, + "reward_std": 0.18082961216568946, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2723 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.6145935058594, + "epoch": 0.8718194911185789, + "grad_norm": 0.13142850995063782, + "kl": 0.26408388651907444, + "learning_rate": 9.82705183115842e-07, + "loss": 0.1009, + "reward": 1.7942708611488343, + "reward_std": 0.19019991308450698, + "rewards/accuracy_reward": 0.08750000149011612, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7380208432674408, + "step": 2724 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.4479370117188, + "epoch": 0.8721395423267723, + "grad_norm": 0.2968617379665375, + "kl": 0.36875737756490706, + "learning_rate": 9.77879393095823e-07, + "loss": 0.0958, + "reward": 1.8088542342185974, + "reward_std": 0.21599897295236586, + "rewards/accuracy_reward": 0.10416667275130749, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2725 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.4604339599609, + "epoch": 0.8724595935349656, + "grad_norm": 0.09855780750513077, + "kl": 0.28418837301433086, + "learning_rate": 9.730648721084601e-07, + "loss": 0.0916, + "reward": 1.7796875357627868, + "reward_std": 0.16487232595682144, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2726 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.2479431152344, + "epoch": 0.8727796447431589, + "grad_norm": 0.23963011801242828, + "kl": 0.3469237022101879, + "learning_rate": 9.682616261673039e-07, + "loss": 0.1049, + "reward": 1.7890625476837159, + "reward_std": 0.2078261062502861, + "rewards/accuracy_reward": 0.09583333730697632, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7348958611488342, + "step": 2727 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.1500091552734, + "epoch": 0.8730996959513522, + "grad_norm": 0.2292696088552475, + "kl": 0.2644490167498589, + "learning_rate": 9.634696612718242e-07, + "loss": 0.0742, + "reward": 1.8494792342185975, + "reward_std": 0.1921914353966713, + "rewards/accuracy_reward": 0.1562500035390258, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7369791805744171, + "step": 2728 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.3479217529297, + "epoch": 0.8734197471595455, + "grad_norm": 0.11155485361814499, + "kl": 0.35314866527915, + "learning_rate": 9.586889834073997e-07, + "loss": 0.0853, + "reward": 1.7614583611488341, + "reward_std": 0.2156276598572731, + "rewards/accuracy_reward": 0.058333334513008595, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2729 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.5625183105469, + "epoch": 0.8737397983677389, + "grad_norm": 0.27473822236061096, + "kl": 0.3876804620027542, + "learning_rate": 9.53919598545312e-07, + "loss": 0.1188, + "reward": 1.7916667103767394, + "reward_std": 0.17802366763353347, + "rewards/accuracy_reward": 0.08750000204890966, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2730 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.5500183105469, + "epoch": 0.8740598495759322, + "grad_norm": 0.10562120378017426, + "kl": 0.3450591519474983, + "learning_rate": 9.491615126427356e-07, + "loss": 0.138, + "reward": 1.7677083611488342, + "reward_std": 0.23906174153089524, + "rewards/accuracy_reward": 0.07916666883975268, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2731 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.8187744140625, + "epoch": 0.8743799007841254, + "grad_norm": 0.125253826379776, + "kl": 0.3737052485346794, + "learning_rate": 9.444147316427332e-07, + "loss": 0.0937, + "reward": 1.7614583730697633, + "reward_std": 0.18856341540813445, + "rewards/accuracy_reward": 0.06458333432674408, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2732 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.6729431152344, + "epoch": 0.8746999519923188, + "grad_norm": 0.08726673573255539, + "kl": 0.24158574528992177, + "learning_rate": 9.396792614742478e-07, + "loss": 0.0792, + "reward": 1.7854166865348815, + "reward_std": 0.2222141295671463, + "rewards/accuracy_reward": 0.07500000391155481, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2733 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.0708526611328, + "epoch": 0.8750200032005121, + "grad_norm": 0.1890716254711151, + "kl": 0.49842873513698577, + "learning_rate": 9.349551080520913e-07, + "loss": 0.1681, + "reward": 1.7322917103767395, + "reward_std": 0.24929236024618148, + "rewards/accuracy_reward": 0.05416666828095913, + "rewards/format_reward": 0.9395833551883698, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2734 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.145849609375, + "epoch": 0.8753400544087054, + "grad_norm": 0.22258751094341278, + "kl": 0.3496193356812, + "learning_rate": 9.302422772769437e-07, + "loss": 0.1171, + "reward": 1.7927083849906922, + "reward_std": 0.2532180845737457, + "rewards/accuracy_reward": 0.10416666921228171, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2735 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.139599609375, + "epoch": 0.8756601056168987, + "grad_norm": 0.17098496854305267, + "kl": 0.2673977643251419, + "learning_rate": 9.255407750353429e-07, + "loss": 0.0588, + "reward": 1.7234375476837158, + "reward_std": 0.16605336368083953, + "rewards/accuracy_reward": 0.012500000186264515, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2736 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.7958618164063, + "epoch": 0.875980156825092, + "grad_norm": 0.1342511624097824, + "kl": 0.3689066171646118, + "learning_rate": 9.208506071996759e-07, + "loss": 0.1102, + "reward": 1.7838542103767394, + "reward_std": 0.21994786560535431, + "rewards/accuracy_reward": 0.09791666977107524, + "rewards/format_reward": 0.9541666984558106, + "rewards/tag_count_reward": 0.7317708551883697, + "step": 2737 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.3604431152344, + "epoch": 0.8763002080332853, + "grad_norm": 0.2091393917798996, + "kl": 0.3664947893470526, + "learning_rate": 9.161717796281677e-07, + "loss": 0.1138, + "reward": 1.7390625476837158, + "reward_std": 0.2935358829796314, + "rewards/accuracy_reward": 0.058333336003124715, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7286458492279053, + "step": 2738 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.3541870117188, + "epoch": 0.8766202592414787, + "grad_norm": 0.2279767394065857, + "kl": 0.3544710837304592, + "learning_rate": 9.115042981648903e-07, + "loss": 0.0853, + "reward": 1.7359375596046447, + "reward_std": 0.2095361977815628, + "rewards/accuracy_reward": 0.04166666902601719, + "rewards/format_reward": 0.950000011920929, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 2739 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.2104370117188, + "epoch": 0.8769403104496719, + "grad_norm": 0.1344815194606781, + "kl": 0.3071120284497738, + "learning_rate": 9.068481686397324e-07, + "loss": 0.0937, + "reward": 1.8223958730697631, + "reward_std": 0.19986422806978227, + "rewards/accuracy_reward": 0.1270833358168602, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2740 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.1020904541016, + "epoch": 0.8772603616578653, + "grad_norm": 0.20531679689884186, + "kl": 0.2685227755457163, + "learning_rate": 9.022033968684119e-07, + "loss": 0.0663, + "reward": 1.839062547683716, + "reward_std": 0.15720132291316985, + "rewards/accuracy_reward": 0.1229166716337204, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7432291805744171, + "step": 2741 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.4583587646484, + "epoch": 0.8775804128660586, + "grad_norm": 0.1268453747034073, + "kl": 0.3735459715127945, + "learning_rate": 8.975699886524536e-07, + "loss": 0.0884, + "reward": 1.7067708849906922, + "reward_std": 0.21321462467312813, + "rewards/accuracy_reward": 0.014583333767950535, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7359375119209289, + "step": 2742 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.3875244140625, + "epoch": 0.8779004640742519, + "grad_norm": 0.131356343626976, + "kl": 0.4861250571906567, + "learning_rate": 8.929479497791926e-07, + "loss": 0.0604, + "reward": 1.7661458730697632, + "reward_std": 0.2645136177539825, + "rewards/accuracy_reward": 0.08333333637565374, + "rewards/format_reward": 0.9458333611488342, + "rewards/tag_count_reward": 0.736979192495346, + "step": 2743 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.4333587646485, + "epoch": 0.8782205152824452, + "grad_norm": 0.2043062150478363, + "kl": 0.2636591024696827, + "learning_rate": 8.88337286021762e-07, + "loss": 0.09, + "reward": 1.7682292103767394, + "reward_std": 0.17424845919013024, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 2744 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.2041809082032, + "epoch": 0.8785405664906385, + "grad_norm": 0.12795977294445038, + "kl": 0.17618353292346, + "learning_rate": 8.837380031390885e-07, + "loss": 0.0719, + "reward": 1.8588542342185974, + "reward_std": 0.1677723281085491, + "rewards/accuracy_reward": 0.1562500052154064, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2745 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.6916870117187, + "epoch": 0.8788606176988318, + "grad_norm": 0.08926547318696976, + "kl": 0.1727425143122673, + "learning_rate": 8.791501068758823e-07, + "loss": 0.0632, + "reward": 1.7614583611488341, + "reward_std": 0.20442461520433425, + "rewards/accuracy_reward": 0.05833333488553762, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 2746 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.4208557128907, + "epoch": 0.8791806689070252, + "grad_norm": 0.24627292156219482, + "kl": 0.22680708356201648, + "learning_rate": 8.745736029626306e-07, + "loss": 0.0524, + "reward": 1.726562535762787, + "reward_std": 0.12180216982960701, + "rewards/accuracy_reward": 0.00416666679084301, + "rewards/format_reward": 0.9791666865348816, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2747 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.7125213623046, + "epoch": 0.8795007201152184, + "grad_norm": 0.13664305210113525, + "kl": 0.23294759541749954, + "learning_rate": 8.70008497115592e-07, + "loss": 0.0874, + "reward": 1.8510417103767396, + "reward_std": 0.183480966091156, + "rewards/accuracy_reward": 0.13958333879709245, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 2748 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.5187683105469, + "epoch": 0.8798207713234117, + "grad_norm": 0.2415604591369629, + "kl": 0.3387389235198498, + "learning_rate": 8.654547950367898e-07, + "loss": 0.1045, + "reward": 1.7510416984558106, + "reward_std": 0.18389342874288558, + "rewards/accuracy_reward": 0.0479166679084301, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7447916805744171, + "step": 2749 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.1229370117187, + "epoch": 0.8801408225316051, + "grad_norm": 0.165102019906044, + "kl": 0.28490790314972403, + "learning_rate": 8.609125024139986e-07, + "loss": 0.0811, + "reward": 1.7822917103767395, + "reward_std": 0.20692486464977264, + "rewards/accuracy_reward": 0.07708333600312471, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2750 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.864599609375, + "epoch": 0.8804608737397984, + "grad_norm": 0.16449257731437683, + "kl": 0.31549433767795565, + "learning_rate": 8.563816249207457e-07, + "loss": 0.0955, + "reward": 1.7578125596046448, + "reward_std": 0.22044627070426942, + "rewards/accuracy_reward": 0.05833333507180214, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7411458432674408, + "step": 2751 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.3916900634765, + "epoch": 0.8807809249479917, + "grad_norm": 0.14509373903274536, + "kl": 0.26425855085253713, + "learning_rate": 8.51862168216303e-07, + "loss": 0.0705, + "reward": 1.7729166984558105, + "reward_std": 0.18468779399991037, + "rewards/accuracy_reward": 0.06458333600312471, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7416666746139526, + "step": 2752 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.145849609375, + "epoch": 0.881100976156185, + "grad_norm": 0.17047415673732758, + "kl": 0.31822266019880774, + "learning_rate": 8.473541379456707e-07, + "loss": 0.0821, + "reward": 1.9010417342185975, + "reward_std": 0.22661916613578797, + "rewards/accuracy_reward": 0.18541667219251395, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2753 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.94794921875, + "epoch": 0.8814210273643783, + "grad_norm": 0.15989422798156738, + "kl": 0.30939139164984225, + "learning_rate": 8.428575397395833e-07, + "loss": 0.0502, + "reward": 1.8328125596046447, + "reward_std": 0.12547328621149062, + "rewards/accuracy_reward": 0.11250000353902578, + "rewards/format_reward": 0.9770833551883698, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2754 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.9437744140625, + "epoch": 0.8817410785725716, + "grad_norm": 0.3773564398288727, + "kl": 0.5229472696781159, + "learning_rate": 8.383723792144916e-07, + "loss": 0.0804, + "reward": 1.723437535762787, + "reward_std": 0.18466463536024094, + "rewards/accuracy_reward": 0.022916667535901068, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2755 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.4625244140625, + "epoch": 0.8820611297807649, + "grad_norm": 0.11909200251102448, + "kl": 0.20261020734906196, + "learning_rate": 8.338986619725631e-07, + "loss": 0.0616, + "reward": 1.7557291984558105, + "reward_std": 0.19833394810557364, + "rewards/accuracy_reward": 0.05208333544433117, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7411458611488342, + "step": 2756 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.8541809082031, + "epoch": 0.8823811809889582, + "grad_norm": 0.1677577942609787, + "kl": 0.2974459655582905, + "learning_rate": 8.294363936016725e-07, + "loss": 0.1096, + "reward": 1.7640625476837157, + "reward_std": 0.21791196018457412, + "rewards/accuracy_reward": 0.07916666995733976, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7348958432674408, + "step": 2757 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.4750183105468, + "epoch": 0.8827012321971516, + "grad_norm": 0.3134852349758148, + "kl": 0.32943947799503803, + "learning_rate": 8.249855796753881e-07, + "loss": 0.0577, + "reward": 1.817708384990692, + "reward_std": 0.18252429068088533, + "rewards/accuracy_reward": 0.10208333563059568, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2758 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.122933959961, + "epoch": 0.8830212834053448, + "grad_norm": 0.21888695657253265, + "kl": 0.15557781457901002, + "learning_rate": 8.205462257529795e-07, + "loss": 0.0677, + "reward": 1.7666667222976684, + "reward_std": 0.17220090329647064, + "rewards/accuracy_reward": 0.052083334513008596, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2759 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.1833557128906, + "epoch": 0.8833413346135381, + "grad_norm": 0.15581285953521729, + "kl": 0.21956364698708059, + "learning_rate": 8.161183373793968e-07, + "loss": 0.0811, + "reward": 1.7536458730697633, + "reward_std": 0.15556090101599693, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.9645833432674408, + "rewards/tag_count_reward": 0.7432291746139527, + "step": 2760 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.208349609375, + "epoch": 0.8836613858217315, + "grad_norm": 0.30839839577674866, + "kl": 0.334910923242569, + "learning_rate": 8.117019200852716e-07, + "loss": 0.0615, + "reward": 1.7645833492279053, + "reward_std": 0.1759663164615631, + "rewards/accuracy_reward": 0.05000000074505806, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2761 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.3271118164063, + "epoch": 0.8839814370299248, + "grad_norm": 0.15235859155654907, + "kl": 0.30991976857185366, + "learning_rate": 8.07296979386909e-07, + "loss": 0.0648, + "reward": 1.8390625596046448, + "reward_std": 0.1289732813835144, + "rewards/accuracy_reward": 0.12291667014360427, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7473958492279053, + "step": 2762 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.8354339599609, + "epoch": 0.884301488238118, + "grad_norm": 0.19533921778202057, + "kl": 0.2847344473004341, + "learning_rate": 8.029035207862712e-07, + "loss": 0.1147, + "reward": 1.7619791984558106, + "reward_std": 0.2641393207013607, + "rewards/accuracy_reward": 0.07500000074505805, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7328125178813935, + "step": 2763 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.4291931152344, + "epoch": 0.8846215394463114, + "grad_norm": 0.12322978675365448, + "kl": 0.21314894780516624, + "learning_rate": 7.985215497709909e-07, + "loss": 0.0805, + "reward": 1.8369792103767395, + "reward_std": 0.19562975615262984, + "rewards/accuracy_reward": 0.12708333600312471, + "rewards/format_reward": 0.9687500298023224, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2764 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.8562744140625, + "epoch": 0.8849415906545047, + "grad_norm": 0.08437777310609818, + "kl": 0.26649289689958094, + "learning_rate": 7.94151071814343e-07, + "loss": 0.0789, + "reward": 1.7765625357627868, + "reward_std": 0.18437634333968161, + "rewards/accuracy_reward": 0.07291667070239782, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2765 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.8729431152344, + "epoch": 0.8852616418626981, + "grad_norm": 0.2084723263978958, + "kl": 0.29305521994829176, + "learning_rate": 7.897920923752533e-07, + "loss": 0.0981, + "reward": 1.764062523841858, + "reward_std": 0.2217009961605072, + "rewards/accuracy_reward": 0.06458333544433117, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2766 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.0062622070312, + "epoch": 0.8855816930708913, + "grad_norm": 0.12448950111865997, + "kl": 0.28461782485246656, + "learning_rate": 7.854446168982777e-07, + "loss": 0.0964, + "reward": 1.8291667103767395, + "reward_std": 0.2683658979833126, + "rewards/accuracy_reward": 0.13125000502914191, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 2767 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.808349609375, + "epoch": 0.8859017442790846, + "grad_norm": 0.11394886672496796, + "kl": 0.2418998047709465, + "learning_rate": 7.811086508136112e-07, + "loss": 0.0943, + "reward": 1.7395833611488343, + "reward_std": 0.16707825362682344, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2768 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.5979400634766, + "epoch": 0.886221795487278, + "grad_norm": 0.2881830930709839, + "kl": 0.3590947136282921, + "learning_rate": 7.767841995370673e-07, + "loss": 0.1011, + "reward": 1.8229166865348816, + "reward_std": 0.21268870532512665, + "rewards/accuracy_reward": 0.12291666865348816, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2769 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.252099609375, + "epoch": 0.8865418466954713, + "grad_norm": 0.16620886325836182, + "kl": 0.18743032775819302, + "learning_rate": 7.724712684700819e-07, + "loss": 0.0947, + "reward": 1.7177083849906922, + "reward_std": 0.17939387410879135, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2770 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.0541809082031, + "epoch": 0.8868618979036645, + "grad_norm": 0.1098373681306839, + "kl": 0.1804877772927284, + "learning_rate": 7.681698629996959e-07, + "loss": 0.0475, + "reward": 1.8322917342185974, + "reward_std": 0.10751088634133339, + "rewards/accuracy_reward": 0.10208333656191826, + "rewards/format_reward": 0.9833333492279053, + "rewards/tag_count_reward": 0.7468750178813934, + "step": 2771 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.4771026611328, + "epoch": 0.8871819491118579, + "grad_norm": 0.337385356426239, + "kl": 0.18855189830064772, + "learning_rate": 7.638799884985593e-07, + "loss": 0.0544, + "reward": 1.7598958492279053, + "reward_std": 0.16964001804590226, + "rewards/accuracy_reward": 0.04375000055879354, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.745312511920929, + "step": 2772 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.3062622070313, + "epoch": 0.8875020003200512, + "grad_norm": 0.16040952503681183, + "kl": 0.37417167574167254, + "learning_rate": 7.59601650324917e-07, + "loss": 0.0889, + "reward": 1.7911458611488342, + "reward_std": 0.1644122764468193, + "rewards/accuracy_reward": 0.08125000149011612, + "rewards/format_reward": 0.9666666746139526, + "rewards/tag_count_reward": 0.743229192495346, + "step": 2773 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.3562683105469, + "epoch": 0.8878220515282446, + "grad_norm": 0.182782843708992, + "kl": 0.19795044660568237, + "learning_rate": 7.553348538226079e-07, + "loss": 0.074, + "reward": 1.778125035762787, + "reward_std": 0.1225196048617363, + "rewards/accuracy_reward": 0.06875000204890966, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.744791692495346, + "step": 2774 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.7416870117188, + "epoch": 0.8881421027364378, + "grad_norm": 0.3356267511844635, + "kl": 0.27316139116883276, + "learning_rate": 7.510796043210477e-07, + "loss": 0.104, + "reward": 1.809895884990692, + "reward_std": 0.15356628447771073, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2775 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.4791870117188, + "epoch": 0.8884621539446311, + "grad_norm": 0.11805539578199387, + "kl": 0.18329550586640836, + "learning_rate": 7.468359071352338e-07, + "loss": 0.0604, + "reward": 1.8666667222976685, + "reward_std": 0.20947734415531158, + "rewards/accuracy_reward": 0.15208333637565374, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2776 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.1708587646484, + "epoch": 0.8887822051528245, + "grad_norm": 0.08612954616546631, + "kl": 0.15863345116376876, + "learning_rate": 7.426037675657361e-07, + "loss": 0.0447, + "reward": 1.8442708730697632, + "reward_std": 0.164375888556242, + "rewards/accuracy_reward": 0.1208333345130086, + "rewards/format_reward": 0.9791666865348816, + "rewards/tag_count_reward": 0.7442708432674408, + "step": 2777 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.7875030517578, + "epoch": 0.8891022563610178, + "grad_norm": 0.10837756097316742, + "kl": 0.21986165940761565, + "learning_rate": 7.383831908986849e-07, + "loss": 0.054, + "reward": 1.8770833969116212, + "reward_std": 0.2275755934417248, + "rewards/accuracy_reward": 0.15625000596046448, + "rewards/format_reward": 0.9750000238418579, + "rewards/tag_count_reward": 0.7458333551883698, + "step": 2778 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.6583526611328, + "epoch": 0.889422307569211, + "grad_norm": 0.1390489637851715, + "kl": 0.1624234464019537, + "learning_rate": 7.341741824057713e-07, + "loss": 0.0748, + "reward": 1.863020896911621, + "reward_std": 0.17711979597806932, + "rewards/accuracy_reward": 0.15000000335276126, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2779 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.8541809082031, + "epoch": 0.8897423587774044, + "grad_norm": 0.13301320374011993, + "kl": 0.20487010031938552, + "learning_rate": 7.299767473442332e-07, + "loss": 0.0712, + "reward": 1.806770884990692, + "reward_std": 0.17180782556533813, + "rewards/accuracy_reward": 0.09791666865348816, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2780 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.8833557128906, + "epoch": 0.8900624099855977, + "grad_norm": 0.10869420319795609, + "kl": 0.18663627617061138, + "learning_rate": 7.257908909568567e-07, + "loss": 0.077, + "reward": 1.7598958730697631, + "reward_std": 0.1811968594789505, + "rewards/accuracy_reward": 0.04583333395421505, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7473958492279053, + "step": 2781 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.9416870117187, + "epoch": 0.890382461193791, + "grad_norm": 0.1881677657365799, + "kl": 0.3977797865867615, + "learning_rate": 7.216166184719653e-07, + "loss": 0.1338, + "reward": 1.6932291865348816, + "reward_std": 0.2559267617762089, + "rewards/accuracy_reward": 0.01041666679084301, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7328125238418579, + "step": 2782 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.9166870117188, + "epoch": 0.8907025124019843, + "grad_norm": 0.14625728130340576, + "kl": 0.1800611212849617, + "learning_rate": 7.174539351034071e-07, + "loss": 0.0586, + "reward": 1.8166667342185974, + "reward_std": 0.17907868325710297, + "rewards/accuracy_reward": 0.10000000353902579, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2783 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.2916870117188, + "epoch": 0.8910225636101776, + "grad_norm": 0.324567049741745, + "kl": 0.268314453586936, + "learning_rate": 7.133028460505642e-07, + "loss": 0.1042, + "reward": 1.7411458849906922, + "reward_std": 0.22085545733571052, + "rewards/accuracy_reward": 0.04791666734963655, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7369791805744171, + "step": 2784 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.4229309082032, + "epoch": 0.891342614818371, + "grad_norm": 0.23639392852783203, + "kl": 0.4426693290472031, + "learning_rate": 7.091633564983314e-07, + "loss": 0.129, + "reward": 1.8010417103767395, + "reward_std": 0.3021409660577774, + "rewards/accuracy_reward": 0.11458333656191826, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2785 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.8229370117188, + "epoch": 0.8916626660265643, + "grad_norm": 0.13480092585086823, + "kl": 0.34565304294228555, + "learning_rate": 7.05035471617117e-07, + "loss": 0.1022, + "reward": 1.7161458730697632, + "reward_std": 0.20841763466596602, + "rewards/accuracy_reward": 0.02500000149011612, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2786 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.7416809082031, + "epoch": 0.8919827172347575, + "grad_norm": 0.23543386161327362, + "kl": 0.29164321571588514, + "learning_rate": 7.009191965628348e-07, + "loss": 0.0929, + "reward": 1.7140625238418579, + "reward_std": 0.22377543151378632, + "rewards/accuracy_reward": 0.02291666679084301, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2787 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.6020965576172, + "epoch": 0.8923027684429509, + "grad_norm": 0.08022233843803406, + "kl": 0.18929398953914642, + "learning_rate": 6.96814536476893e-07, + "loss": 0.0559, + "reward": 1.7442708611488342, + "reward_std": 0.15617139339447023, + "rewards/accuracy_reward": 0.0291666679084301, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7442708432674408, + "step": 2788 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.7229431152343, + "epoch": 0.8926228196511442, + "grad_norm": 0.12308970093727112, + "kl": 0.3831124782562256, + "learning_rate": 6.927214964861995e-07, + "loss": 0.0681, + "reward": 1.9031250596046447, + "reward_std": 0.15697635114192962, + "rewards/accuracy_reward": 0.19375000521540642, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7427083611488342, + "step": 2789 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.0833557128906, + "epoch": 0.8929428708593375, + "grad_norm": 0.16935700178146362, + "kl": 0.3776381004601717, + "learning_rate": 6.886400817031435e-07, + "loss": 0.1194, + "reward": 1.7723958969116211, + "reward_std": 0.21539193615317345, + "rewards/accuracy_reward": 0.08125000149011612, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.739062511920929, + "step": 2790 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.7687683105469, + "epoch": 0.8932629220675308, + "grad_norm": 0.08544913679361343, + "kl": 0.24526721499860288, + "learning_rate": 6.845702972255974e-07, + "loss": 0.0746, + "reward": 1.7822917222976684, + "reward_std": 0.18180104196071625, + "rewards/accuracy_reward": 0.08125000316649675, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2791 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.1562774658203, + "epoch": 0.8935829732757241, + "grad_norm": 0.16124188899993896, + "kl": 0.29785202592611315, + "learning_rate": 6.805121481368993e-07, + "loss": 0.087, + "reward": 1.8942708730697633, + "reward_std": 0.24526706635951995, + "rewards/accuracy_reward": 0.18125000819563866, + "rewards/format_reward": 0.9729166924953461, + "rewards/tag_count_reward": 0.7401041924953461, + "step": 2792 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.9500183105469, + "epoch": 0.8939030244839175, + "grad_norm": 0.3463839590549469, + "kl": 0.2856299549341202, + "learning_rate": 6.764656395058622e-07, + "loss": 0.0934, + "reward": 1.7390625119209289, + "reward_std": 0.21077930554747581, + "rewards/accuracy_reward": 0.037500000558793545, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2793 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.3375122070313, + "epoch": 0.8942230756921108, + "grad_norm": 0.13446158170700073, + "kl": 0.31556113585829737, + "learning_rate": 6.724307763867555e-07, + "loss": 0.0784, + "reward": 1.7307292103767395, + "reward_std": 0.19142997413873672, + "rewards/accuracy_reward": 0.02708333395421505, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2794 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.5104400634766, + "epoch": 0.894543126900304, + "grad_norm": 0.33578991889953613, + "kl": 0.3360072895884514, + "learning_rate": 6.684075638193066e-07, + "loss": 0.1057, + "reward": 1.7260416984558105, + "reward_std": 0.24067014306783677, + "rewards/accuracy_reward": 0.04375000223517418, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2795 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.8666839599609, + "epoch": 0.8948631781084974, + "grad_norm": 0.219113290309906, + "kl": 0.25697992742061615, + "learning_rate": 6.643960068286814e-07, + "loss": 0.1114, + "reward": 1.7854167103767395, + "reward_std": 0.19451157450675965, + "rewards/accuracy_reward": 0.08750000111758709, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2796 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.0541809082031, + "epoch": 0.8951832293166907, + "grad_norm": 0.1934441775083542, + "kl": 0.5528342947363853, + "learning_rate": 6.603961104255018e-07, + "loss": 0.1239, + "reward": 1.6994791984558106, + "reward_std": 0.3112195998430252, + "rewards/accuracy_reward": 0.04166666753590107, + "rewards/format_reward": 0.9270833432674408, + "rewards/tag_count_reward": 0.7307291805744172, + "step": 2797 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.883349609375, + "epoch": 0.8955032805248839, + "grad_norm": 0.07793132960796356, + "kl": 0.2168126493692398, + "learning_rate": 6.564078796058137e-07, + "loss": 0.0569, + "reward": 1.726562511920929, + "reward_std": 0.1572144016623497, + "rewards/accuracy_reward": 0.014583333395421505, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2798 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.3062713623046, + "epoch": 0.8958233317330773, + "grad_norm": 0.12241532653570175, + "kl": 0.33055841401219366, + "learning_rate": 6.52431319351099e-07, + "loss": 0.0506, + "reward": 1.7161458611488343, + "reward_std": 0.16185689046978952, + "rewards/accuracy_reward": 0.00833333358168602, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2799 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.6333465576172, + "epoch": 0.8961433829412706, + "grad_norm": 0.18630848824977875, + "kl": 0.3077032431960106, + "learning_rate": 6.484664346282555e-07, + "loss": 0.1194, + "reward": 1.7812500476837159, + "reward_std": 0.2138199493288994, + "rewards/accuracy_reward": 0.09166667070239783, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2800 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.1375213623047, + "epoch": 0.896463434149464, + "grad_norm": 0.13901875913143158, + "kl": 0.32606543600559235, + "learning_rate": 6.44513230389604e-07, + "loss": 0.0603, + "reward": 1.793750035762787, + "reward_std": 0.18691025376319886, + "rewards/accuracy_reward": 0.0916666692122817, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2801 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.393765258789, + "epoch": 0.8967834853576572, + "grad_norm": 0.22432410717010498, + "kl": 0.47303307950496676, + "learning_rate": 6.405717115728727e-07, + "loss": 0.1022, + "reward": 1.7541667103767395, + "reward_std": 0.2878709942102432, + "rewards/accuracy_reward": 0.08541666846722365, + "rewards/format_reward": 0.9354166924953461, + "rewards/tag_count_reward": 0.7333333551883697, + "step": 2802 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.6562744140625, + "epoch": 0.8971035365658505, + "grad_norm": 0.424375057220459, + "kl": 0.3386325158178806, + "learning_rate": 6.366418831011955e-07, + "loss": 0.0917, + "reward": 1.7348958730697632, + "reward_std": 0.22167600244283675, + "rewards/accuracy_reward": 0.04791666828095913, + "rewards/format_reward": 0.9437500238418579, + "rewards/tag_count_reward": 0.743229192495346, + "step": 2803 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.420849609375, + "epoch": 0.8974235877740439, + "grad_norm": 0.20574527978897095, + "kl": 0.27687034383416176, + "learning_rate": 6.32723749883104e-07, + "loss": 0.0859, + "reward": 1.7479166984558105, + "reward_std": 0.23028158992528916, + "rewards/accuracy_reward": 0.05416666753590107, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2804 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.4354370117187, + "epoch": 0.8977436389822372, + "grad_norm": 0.13979966938495636, + "kl": 0.29248685389757156, + "learning_rate": 6.288173168125234e-07, + "loss": 0.0998, + "reward": 1.7921875357627868, + "reward_std": 0.22479058653116227, + "rewards/accuracy_reward": 0.09375000242143869, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7463541865348816, + "step": 2805 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.677099609375, + "epoch": 0.8980636901904304, + "grad_norm": 0.3141164481639862, + "kl": 0.4433578472584486, + "learning_rate": 6.249225887687615e-07, + "loss": 0.1531, + "reward": 1.7578125476837159, + "reward_std": 0.29427343755960467, + "rewards/accuracy_reward": 0.08333333507180214, + "rewards/format_reward": 0.9375000238418579, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2806 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.2291809082031, + "epoch": 0.8983837413986238, + "grad_norm": 0.12283515930175781, + "kl": 0.2124796152114868, + "learning_rate": 6.210395706165106e-07, + "loss": 0.0779, + "reward": 1.8734375476837157, + "reward_std": 0.18899996876716613, + "rewards/accuracy_reward": 0.15833333525806664, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7442708492279053, + "step": 2807 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.6562683105469, + "epoch": 0.8987037926068171, + "grad_norm": 0.2164728194475174, + "kl": 0.21420424431562424, + "learning_rate": 6.171682672058322e-07, + "loss": 0.075, + "reward": 1.7364583611488342, + "reward_std": 0.18904096335172654, + "rewards/accuracy_reward": 0.031250000558793546, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7427083611488342, + "step": 2808 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.497933959961, + "epoch": 0.8990238438150104, + "grad_norm": 0.19198265671730042, + "kl": 0.29132072255015373, + "learning_rate": 6.133086833721569e-07, + "loss": 0.0874, + "reward": 1.814583384990692, + "reward_std": 0.2104831539094448, + "rewards/accuracy_reward": 0.11250000298023224, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 2809 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.2562591552735, + "epoch": 0.8993438950232037, + "grad_norm": 0.15204951167106628, + "kl": 0.2960913643240929, + "learning_rate": 6.094608239362799e-07, + "loss": 0.1252, + "reward": 1.8786458849906922, + "reward_std": 0.2817792721092701, + "rewards/accuracy_reward": 0.1875000072643161, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7369791805744171, + "step": 2810 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.775015258789, + "epoch": 0.899663946231397, + "grad_norm": 0.19675280153751373, + "kl": 0.2733559250831604, + "learning_rate": 6.056246937043475e-07, + "loss": 0.1022, + "reward": 1.7692708492279052, + "reward_std": 0.16505006551742554, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7380208611488343, + "step": 2811 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.0083465576172, + "epoch": 0.8999839974395903, + "grad_norm": 0.29460424184799194, + "kl": 0.3481321565806866, + "learning_rate": 6.018002974678616e-07, + "loss": 0.0974, + "reward": 1.8307292342185975, + "reward_std": 0.22881743013858796, + "rewards/accuracy_reward": 0.1395833384245634, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7369791805744171, + "step": 2812 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.6479370117188, + "epoch": 0.9003040486477837, + "grad_norm": 0.12847913801670074, + "kl": 0.3264305554330349, + "learning_rate": 5.979876400036599e-07, + "loss": 0.1143, + "reward": 1.7645833611488342, + "reward_std": 0.2144318014383316, + "rewards/accuracy_reward": 0.0666666692122817, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7395833432674408, + "step": 2813 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.5354370117187, + "epoch": 0.9006240998559769, + "grad_norm": 0.2630733549594879, + "kl": 0.428336625546217, + "learning_rate": 5.941867260739265e-07, + "loss": 0.091, + "reward": 1.7848958730697633, + "reward_std": 0.1891431801021099, + "rewards/accuracy_reward": 0.08541666977107525, + "rewards/format_reward": 0.9583333432674408, + "rewards/tag_count_reward": 0.7411458432674408, + "step": 2814 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.5000152587891, + "epoch": 0.9009441510641703, + "grad_norm": 0.12521198391914368, + "kl": 0.32432365864515306, + "learning_rate": 5.903975604261725e-07, + "loss": 0.0986, + "reward": 1.8453125596046447, + "reward_std": 0.20809532403945924, + "rewards/accuracy_reward": 0.14166667237877845, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2815 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.6250183105469, + "epoch": 0.9012642022723636, + "grad_norm": 0.1308985948562622, + "kl": 0.335451889783144, + "learning_rate": 5.866201477932321e-07, + "loss": 0.0883, + "reward": 1.7958333611488342, + "reward_std": 0.2012617200613022, + "rewards/accuracy_reward": 0.10000000260770321, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2816 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.7479370117187, + "epoch": 0.9015842534805569, + "grad_norm": 0.09408943355083466, + "kl": 0.24329584948718547, + "learning_rate": 5.828544928932655e-07, + "loss": 0.0881, + "reward": 1.8208333849906921, + "reward_std": 0.22802594751119615, + "rewards/accuracy_reward": 0.11041667051613331, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2817 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.4146057128906, + "epoch": 0.9019043046887502, + "grad_norm": 0.2508453130722046, + "kl": 0.42456541061401365, + "learning_rate": 5.791006004297451e-07, + "loss": 0.1234, + "reward": 1.7947917342185975, + "reward_std": 0.21245116442441941, + "rewards/accuracy_reward": 0.10000000316649675, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7385416984558105, + "step": 2818 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.8187683105468, + "epoch": 0.9022243558969435, + "grad_norm": 0.19571591913700104, + "kl": 0.3216987043619156, + "learning_rate": 5.753584750914476e-07, + "loss": 0.128, + "reward": 1.8833333611488343, + "reward_std": 0.2775250434875488, + "rewards/accuracy_reward": 0.193750006146729, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2819 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.6791748046875, + "epoch": 0.9025444071051368, + "grad_norm": 0.23652778565883636, + "kl": 0.20177725926041604, + "learning_rate": 5.7162812155246e-07, + "loss": 0.0781, + "reward": 1.751562523841858, + "reward_std": 0.21785627081990241, + "rewards/accuracy_reward": 0.05208333414047957, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2820 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.1937744140625, + "epoch": 0.9028644583133302, + "grad_norm": 0.16797806322574615, + "kl": 0.18671303614974022, + "learning_rate": 5.679095444721538e-07, + "loss": 0.0619, + "reward": 1.8432292103767396, + "reward_std": 0.1968873217701912, + "rewards/accuracy_reward": 0.13333333600312472, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7432291805744171, + "step": 2821 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.0875183105469, + "epoch": 0.9031845095215234, + "grad_norm": 0.1344435214996338, + "kl": 0.252733601629734, + "learning_rate": 5.64202748495204e-07, + "loss": 0.1045, + "reward": 1.7807292222976685, + "reward_std": 0.23075918704271317, + "rewards/accuracy_reward": 0.08125000204890967, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7411458611488342, + "step": 2822 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.1604431152343, + "epoch": 0.9035045607297167, + "grad_norm": 0.21127544343471527, + "kl": 0.2684433352202177, + "learning_rate": 5.605077382515644e-07, + "loss": 0.0994, + "reward": 1.7687500357627868, + "reward_std": 0.1979643739759922, + "rewards/accuracy_reward": 0.07291666939854621, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.737500011920929, + "step": 2823 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.8312683105469, + "epoch": 0.9038246119379101, + "grad_norm": 0.14521047472953796, + "kl": 0.23440488129854203, + "learning_rate": 5.568245183564669e-07, + "loss": 0.0719, + "reward": 1.7739583849906921, + "reward_std": 0.20358410999178886, + "rewards/accuracy_reward": 0.07083333358168602, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2824 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.9291839599609, + "epoch": 0.9041446631461034, + "grad_norm": 0.20504160225391388, + "kl": 0.301665635406971, + "learning_rate": 5.531530934104179e-07, + "loss": 0.0843, + "reward": 1.728645884990692, + "reward_std": 0.21848196685314178, + "rewards/accuracy_reward": 0.02916666679084301, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.739062511920929, + "step": 2825 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.1000183105468, + "epoch": 0.9044647143542967, + "grad_norm": 0.08053412288427353, + "kl": 0.2033343430608511, + "learning_rate": 5.494934679991914e-07, + "loss": 0.0564, + "reward": 1.7614583611488341, + "reward_std": 0.16279089897871019, + "rewards/accuracy_reward": 0.0520833358168602, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7406250059604644, + "step": 2826 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.7062622070313, + "epoch": 0.90478476556249, + "grad_norm": 0.24011105298995972, + "kl": 0.26724216900765896, + "learning_rate": 5.458456466938233e-07, + "loss": 0.0639, + "reward": 1.8031250357627868, + "reward_std": 0.18479779735207558, + "rewards/accuracy_reward": 0.10208333637565374, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7364583432674408, + "step": 2827 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.0625152587891, + "epoch": 0.9051048167706833, + "grad_norm": 0.09952478110790253, + "kl": 0.2889927580952644, + "learning_rate": 5.422096340506089e-07, + "loss": 0.0875, + "reward": 1.7843750357627868, + "reward_std": 0.17545911446213722, + "rewards/accuracy_reward": 0.07708333786576986, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2828 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.0750183105469, + "epoch": 0.9054248679788767, + "grad_norm": 0.16065450012683868, + "kl": 0.31739690750837324, + "learning_rate": 5.385854346110853e-07, + "loss": 0.1125, + "reward": 1.8359375238418578, + "reward_std": 0.2566369533538818, + "rewards/accuracy_reward": 0.145833339355886, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7338541865348815, + "step": 2829 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.0604431152344, + "epoch": 0.9057449191870699, + "grad_norm": 0.19560876488685608, + "kl": 0.30017624273896215, + "learning_rate": 5.349730529020436e-07, + "loss": 0.1009, + "reward": 1.7458333730697633, + "reward_std": 0.2519936338067055, + "rewards/accuracy_reward": 0.06458333637565375, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2830 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.1521087646485, + "epoch": 0.9060649703952632, + "grad_norm": 0.15118496119976044, + "kl": 0.21989080756902696, + "learning_rate": 5.313724934355102e-07, + "loss": 0.0791, + "reward": 1.7239583611488343, + "reward_std": 0.15902083963155747, + "rewards/accuracy_reward": 0.01041666679084301, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7447916984558105, + "step": 2831 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.327099609375, + "epoch": 0.9063850216034566, + "grad_norm": 0.262523353099823, + "kl": 0.49291563779115677, + "learning_rate": 5.277837607087455e-07, + "loss": 0.1086, + "reward": 1.7229166865348815, + "reward_std": 0.24671917259693146, + "rewards/accuracy_reward": 0.04166666828095913, + "rewards/format_reward": 0.9416666924953461, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 2832 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.0666809082031, + "epoch": 0.9067050728116499, + "grad_norm": 0.08810193836688995, + "kl": 0.2723796620965004, + "learning_rate": 5.242068592042349e-07, + "loss": 0.0875, + "reward": 1.7750000476837158, + "reward_std": 0.20654670670628547, + "rewards/accuracy_reward": 0.07708333730697632, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2833 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.1396087646484, + "epoch": 0.9070251240198431, + "grad_norm": 0.1329844743013382, + "kl": 0.37486855015158654, + "learning_rate": 5.206417933896901e-07, + "loss": 0.0794, + "reward": 1.7984375357627869, + "reward_std": 0.21113753989338874, + "rewards/accuracy_reward": 0.09791666865348816, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7401041746139526, + "step": 2834 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.2416870117188, + "epoch": 0.9073451752280365, + "grad_norm": 0.09566251188516617, + "kl": 0.24509716033935547, + "learning_rate": 5.170885677180382e-07, + "loss": 0.0561, + "reward": 1.812500035762787, + "reward_std": 0.1898981362581253, + "rewards/accuracy_reward": 0.11250000428408384, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7437500119209289, + "step": 2835 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.6187683105469, + "epoch": 0.9076652264362298, + "grad_norm": 0.16700927913188934, + "kl": 0.38997380435466766, + "learning_rate": 5.135471866274167e-07, + "loss": 0.1089, + "reward": 1.8125000476837159, + "reward_std": 0.2434275358915329, + "rewards/accuracy_reward": 0.1333333384245634, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.735416692495346, + "step": 2836 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.7104309082031, + "epoch": 0.9079852776444232, + "grad_norm": 0.16015952825546265, + "kl": 0.2919667445123196, + "learning_rate": 5.100176545411706e-07, + "loss": 0.0978, + "reward": 1.7885417103767396, + "reward_std": 0.24260507076978682, + "rewards/accuracy_reward": 0.09583333786576986, + "rewards/format_reward": 0.9604166746139526, + "rewards/tag_count_reward": 0.7322916805744171, + "step": 2837 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.6479339599609, + "epoch": 0.9083053288526164, + "grad_norm": 0.10905592143535614, + "kl": 0.23061162009835243, + "learning_rate": 5.064999758678391e-07, + "loss": 0.0614, + "reward": 1.8208333849906921, + "reward_std": 0.15975419506430627, + "rewards/accuracy_reward": 0.11041666939854622, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7416666746139526, + "step": 2838 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.3021026611328, + "epoch": 0.9086253800608097, + "grad_norm": 0.1088496744632721, + "kl": 0.28061444610357283, + "learning_rate": 5.029941550011663e-07, + "loss": 0.0869, + "reward": 1.8427083611488342, + "reward_std": 0.20039626583456993, + "rewards/accuracy_reward": 0.13750000353902578, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2839 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.6416839599609, + "epoch": 0.9089454312690031, + "grad_norm": 0.17556871473789215, + "kl": 0.37912697792053224, + "learning_rate": 4.995001963200763e-07, + "loss": 0.111, + "reward": 1.8447916865348817, + "reward_std": 0.2618077598512173, + "rewards/accuracy_reward": 0.16041667386889458, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7343750119209289, + "step": 2840 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.7729339599609, + "epoch": 0.9092654824771963, + "grad_norm": 0.12427106499671936, + "kl": 0.2443702958524227, + "learning_rate": 4.960181041886802e-07, + "loss": 0.0688, + "reward": 1.7322916984558105, + "reward_std": 0.16462817713618277, + "rewards/accuracy_reward": 0.02083333358168602, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 2841 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.7041809082032, + "epoch": 0.9095855336853896, + "grad_norm": 0.18514235317707062, + "kl": 0.22610717713832856, + "learning_rate": 4.925478829562668e-07, + "loss": 0.0676, + "reward": 1.8135417222976684, + "reward_std": 0.21168894320726395, + "rewards/accuracy_reward": 0.10625000298023224, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2842 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.4666809082031, + "epoch": 0.909905584893583, + "grad_norm": 0.13283954560756683, + "kl": 0.22682435177266597, + "learning_rate": 4.89089536957299e-07, + "loss": 0.063, + "reward": 1.7192708730697632, + "reward_std": 0.13977629393339158, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7421875119209289, + "step": 2843 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.0479431152344, + "epoch": 0.9102256361017763, + "grad_norm": 0.1513800472021103, + "kl": 0.30399431884288786, + "learning_rate": 4.856430705114035e-07, + "loss": 0.077, + "reward": 1.7833333611488342, + "reward_std": 0.17256311923265458, + "rewards/accuracy_reward": 0.06875000167638064, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7458333551883698, + "step": 2844 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.5208557128906, + "epoch": 0.9105456873099695, + "grad_norm": 0.11325372755527496, + "kl": 0.19456406235694884, + "learning_rate": 4.822084879233746e-07, + "loss": 0.0756, + "reward": 1.8213542222976684, + "reward_std": 0.18610538244247438, + "rewards/accuracy_reward": 0.11041666902601718, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2845 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.3896087646484, + "epoch": 0.9108657385181629, + "grad_norm": 0.1760214865207672, + "kl": 0.46450803726911544, + "learning_rate": 4.787857934831564e-07, + "loss": 0.1017, + "reward": 1.7854166984558106, + "reward_std": 0.21061150655150412, + "rewards/accuracy_reward": 0.08541666902601719, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 2846 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.2104309082031, + "epoch": 0.9111857897263562, + "grad_norm": 0.1330924779176712, + "kl": 0.21791302636265755, + "learning_rate": 4.7537499146584896e-07, + "loss": 0.0718, + "reward": 1.8531250357627869, + "reward_std": 0.2734475418925285, + "rewards/accuracy_reward": 0.14791666977107526, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2847 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.4687652587891, + "epoch": 0.9115058409345496, + "grad_norm": 0.19432726502418518, + "kl": 0.2989140644669533, + "learning_rate": 4.7197608613169685e-07, + "loss": 0.0681, + "reward": 1.7250000357627868, + "reward_std": 0.1789463460445404, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.950000011920929, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2848 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.6625244140625, + "epoch": 0.9118258921427428, + "grad_norm": 0.28562673926353455, + "kl": 0.31079639568924905, + "learning_rate": 4.6858908172608743e-07, + "loss": 0.0882, + "reward": 1.839583396911621, + "reward_std": 0.19922738000750542, + "rewards/accuracy_reward": 0.13541667014360428, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2849 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.2125274658204, + "epoch": 0.9121459433509361, + "grad_norm": 0.24760982394218445, + "kl": 0.21860564053058623, + "learning_rate": 4.6521398247953543e-07, + "loss": 0.0861, + "reward": 1.7479166984558105, + "reward_std": 0.1462639383971691, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7437500119209289, + "step": 2850 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.9354370117187, + "epoch": 0.9124659945591295, + "grad_norm": 0.1644178032875061, + "kl": 0.2740326181054115, + "learning_rate": 4.618507926076954e-07, + "loss": 0.0977, + "reward": 1.7671875357627869, + "reward_std": 0.19867352321743964, + "rewards/accuracy_reward": 0.06250000223517418, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2851 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.0396026611328, + "epoch": 0.9127860457673228, + "grad_norm": 0.22223533689975739, + "kl": 0.24603908509016037, + "learning_rate": 4.584995163113404e-07, + "loss": 0.086, + "reward": 1.8843750834465027, + "reward_std": 0.18686745911836625, + "rewards/accuracy_reward": 0.17500000596046447, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2852 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.7104431152344, + "epoch": 0.913106096975516, + "grad_norm": 0.08565722405910492, + "kl": 0.2290005251765251, + "learning_rate": 4.5516015777636535e-07, + "loss": 0.0813, + "reward": 1.9026042222976685, + "reward_std": 0.23698803335428237, + "rewards/accuracy_reward": 0.1916666716337204, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2853 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.058349609375, + "epoch": 0.9134261481837094, + "grad_norm": 0.22499686479568481, + "kl": 0.3013485103845596, + "learning_rate": 4.518327211737761e-07, + "loss": 0.0715, + "reward": 1.7885417222976685, + "reward_std": 0.17250624895095826, + "rewards/accuracy_reward": 0.09583333730697632, + "rewards/format_reward": 0.9520833432674408, + "rewards/tag_count_reward": 0.7406250059604644, + "step": 2854 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.8104431152344, + "epoch": 0.9137461993919027, + "grad_norm": 0.1874050498008728, + "kl": 0.3433677464723587, + "learning_rate": 4.4851721065969243e-07, + "loss": 0.1284, + "reward": 1.6703125357627868, + "reward_std": 0.25159602984786034, + "rewards/accuracy_reward": 0.01666666716337204, + "rewards/format_reward": 0.9187500238418579, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2855 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.7979400634765, + "epoch": 0.914066250600096, + "grad_norm": 0.13606959581375122, + "kl": 0.28302004411816595, + "learning_rate": 4.4521363037533627e-07, + "loss": 0.0707, + "reward": 1.7906250476837158, + "reward_std": 0.12235096469521523, + "rewards/accuracy_reward": 0.07500000316649676, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7489583492279053, + "step": 2856 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.6062683105469, + "epoch": 0.9143863018082893, + "grad_norm": 0.07938756793737411, + "kl": 0.22487426400184632, + "learning_rate": 4.4192198444702685e-07, + "loss": 0.0844, + "reward": 1.7760416984558105, + "reward_std": 0.23371648490428926, + "rewards/accuracy_reward": 0.08333333637565374, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2857 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.8208557128906, + "epoch": 0.9147063530164826, + "grad_norm": 0.17403903603553772, + "kl": 0.39559953659772873, + "learning_rate": 4.386422769861742e-07, + "loss": 0.0982, + "reward": 1.7104166984558105, + "reward_std": 0.20936973839998246, + "rewards/accuracy_reward": 0.018750000186264514, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2858 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.0604309082031, + "epoch": 0.915026404224676, + "grad_norm": 0.13922373950481415, + "kl": 0.22795844152569772, + "learning_rate": 4.353745120892838e-07, + "loss": 0.0631, + "reward": 1.7739583492279052, + "reward_std": 0.1610909268260002, + "rewards/accuracy_reward": 0.05625000018626451, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 2859 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.4104431152343, + "epoch": 0.9153464554328693, + "grad_norm": 0.11690249294042587, + "kl": 0.2956189580261707, + "learning_rate": 4.3211869383793735e-07, + "loss": 0.1005, + "reward": 1.7656250596046448, + "reward_std": 0.25438908860087395, + "rewards/accuracy_reward": 0.07916666846722364, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2860 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.4208435058594, + "epoch": 0.9156665066410625, + "grad_norm": 0.10374096035957336, + "kl": 0.22751567736268044, + "learning_rate": 4.288748262987996e-07, + "loss": 0.1048, + "reward": 1.8015625596046447, + "reward_std": 0.19006576761603355, + "rewards/accuracy_reward": 0.09375000279396772, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7432291805744171, + "step": 2861 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.8062683105469, + "epoch": 0.9159865578492559, + "grad_norm": 0.09356549382209778, + "kl": 0.20459693036973475, + "learning_rate": 4.256429135236062e-07, + "loss": 0.0595, + "reward": 1.7729166984558105, + "reward_std": 0.145799171179533, + "rewards/accuracy_reward": 0.052083334513008596, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.7437500178813934, + "step": 2862 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.9521118164063, + "epoch": 0.9163066090574492, + "grad_norm": 0.23343823850154877, + "kl": 0.3022413983941078, + "learning_rate": 4.2242295954915913e-07, + "loss": 0.0772, + "reward": 1.8635416984558106, + "reward_std": 0.15203743427991867, + "rewards/accuracy_reward": 0.15000000651925802, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.744791692495346, + "step": 2863 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.5937652587891, + "epoch": 0.9166266602656425, + "grad_norm": 0.1239825040102005, + "kl": 0.3159925784915686, + "learning_rate": 4.1921496839732677e-07, + "loss": 0.0643, + "reward": 1.7963542342185974, + "reward_std": 0.24162321984767915, + "rewards/accuracy_reward": 0.09375000167638063, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2864 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.5729370117188, + "epoch": 0.9169467114738358, + "grad_norm": 0.19408008456230164, + "kl": 0.3092687904834747, + "learning_rate": 4.1601894407503507e-07, + "loss": 0.0504, + "reward": 1.7197916865348817, + "reward_std": 0.15204674303531646, + "rewards/accuracy_reward": 0.008333333395421505, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7427083611488342, + "step": 2865 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.6166778564453, + "epoch": 0.9172667626820291, + "grad_norm": 0.18424807488918304, + "kl": 0.47804224863648415, + "learning_rate": 4.128348905742585e-07, + "loss": 0.0842, + "reward": 1.7911458730697631, + "reward_std": 0.23824312388896943, + "rewards/accuracy_reward": 0.10416667237877845, + "rewards/format_reward": 0.950000011920929, + "rewards/tag_count_reward": 0.7369791805744171, + "step": 2866 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.6479370117188, + "epoch": 0.9175868138902225, + "grad_norm": 0.08997310698032379, + "kl": 0.28598271422088145, + "learning_rate": 4.096628118720236e-07, + "loss": 0.0907, + "reward": 1.7640625357627868, + "reward_std": 0.1785560056567192, + "rewards/accuracy_reward": 0.052083336375653745, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2867 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.0666809082031, + "epoch": 0.9179068650984158, + "grad_norm": 0.1769360601902008, + "kl": 0.3333309397101402, + "learning_rate": 4.065027119303988e-07, + "loss": 0.1033, + "reward": 1.7739583730697632, + "reward_std": 0.26832345873117447, + "rewards/accuracy_reward": 0.09166666883975268, + "rewards/format_reward": 0.9437500238418579, + "rewards/tag_count_reward": 0.7385416746139526, + "step": 2868 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.7500274658203, + "epoch": 0.918226916306609, + "grad_norm": 0.11235616356134415, + "kl": 0.3151254206895828, + "learning_rate": 4.0335459469649117e-07, + "loss": 0.0981, + "reward": 1.800520896911621, + "reward_std": 0.20175526589155196, + "rewards/accuracy_reward": 0.09791666977107524, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2869 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.270849609375, + "epoch": 0.9185469675148024, + "grad_norm": 0.10434413701295853, + "kl": 0.23269173577427865, + "learning_rate": 4.002184641024409e-07, + "loss": 0.0967, + "reward": 1.7770833730697633, + "reward_std": 0.25618855506181715, + "rewards/accuracy_reward": 0.09166666828095912, + "rewards/format_reward": 0.9437500298023224, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2870 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.6979400634766, + "epoch": 0.9188670187229957, + "grad_norm": 0.09728308767080307, + "kl": 0.21025248169898986, + "learning_rate": 3.9709432406541125e-07, + "loss": 0.0779, + "reward": 1.8218750476837158, + "reward_std": 0.15108426734805108, + "rewards/accuracy_reward": 0.10833333842456341, + "rewards/format_reward": 0.9708333611488342, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2871 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.3146087646485, + "epoch": 0.919187069931189, + "grad_norm": 0.16741010546684265, + "kl": 0.3041342481970787, + "learning_rate": 3.9398217848759637e-07, + "loss": 0.0879, + "reward": 1.7890625476837159, + "reward_std": 0.21354203149676323, + "rewards/accuracy_reward": 0.0791666692122817, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2872 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.2562713623047, + "epoch": 0.9195071211393823, + "grad_norm": 0.1411246955394745, + "kl": 0.23895582556724548, + "learning_rate": 3.9088203125620563e-07, + "loss": 0.0973, + "reward": 1.7385416984558106, + "reward_std": 0.2176542192697525, + "rewards/accuracy_reward": 0.0541666679084301, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2873 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.1104339599609, + "epoch": 0.9198271723475756, + "grad_norm": 0.20984359085559845, + "kl": 0.35559300556778906, + "learning_rate": 3.877938862434627e-07, + "loss": 0.1038, + "reward": 1.7473958611488343, + "reward_std": 0.19333869963884354, + "rewards/accuracy_reward": 0.05000000204890966, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.739062511920929, + "step": 2874 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.4083465576172, + "epoch": 0.920147223555769, + "grad_norm": 0.10066534578800201, + "kl": 0.2031643271446228, + "learning_rate": 3.847177473065955e-07, + "loss": 0.0482, + "reward": 1.8208333730697632, + "reward_std": 0.13629631251096724, + "rewards/accuracy_reward": 0.10833333749324084, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7458333432674408, + "step": 2875 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.5645935058594, + "epoch": 0.9204672747639623, + "grad_norm": 0.11685199290513992, + "kl": 0.2875231482088566, + "learning_rate": 3.816536182878416e-07, + "loss": 0.0256, + "reward": 1.7531250238418579, + "reward_std": 0.09930408298969269, + "rewards/accuracy_reward": 0.029166667722165585, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7489583492279053, + "step": 2876 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.2562683105468, + "epoch": 0.9207873259721555, + "grad_norm": 0.12749531865119934, + "kl": 0.27901603281497955, + "learning_rate": 3.786015030144352e-07, + "loss": 0.0592, + "reward": 1.8505208849906922, + "reward_std": 0.17607783749699593, + "rewards/accuracy_reward": 0.13541667070239782, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7463541746139526, + "step": 2877 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.6729370117188, + "epoch": 0.9211073771803489, + "grad_norm": 0.1343710869550705, + "kl": 0.22883844375610352, + "learning_rate": 3.755614052986056e-07, + "loss": 0.0537, + "reward": 1.8072917222976685, + "reward_std": 0.1575484722852707, + "rewards/accuracy_reward": 0.08958333693444728, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7468750178813934, + "step": 2878 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.3687744140625, + "epoch": 0.9214274283885422, + "grad_norm": 0.14246773719787598, + "kl": 0.3555714774876833, + "learning_rate": 3.7253332893756877e-07, + "loss": 0.1228, + "reward": 1.7880208849906922, + "reward_std": 0.2130596399307251, + "rewards/accuracy_reward": 0.10625000316649676, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7338541805744171, + "step": 2879 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.958349609375, + "epoch": 0.9217474795967355, + "grad_norm": 0.1846427470445633, + "kl": 0.2668366312980652, + "learning_rate": 3.695172777135292e-07, + "loss": 0.0932, + "reward": 1.7718750476837157, + "reward_std": 0.15560345873236656, + "rewards/accuracy_reward": 0.06666666865348816, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 2880 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.7458587646485, + "epoch": 0.9220675308049288, + "grad_norm": 0.13593432307243347, + "kl": 0.46407483220100404, + "learning_rate": 3.66513255393669e-07, + "loss": 0.0988, + "reward": 1.7625000596046447, + "reward_std": 0.22477970719337464, + "rewards/accuracy_reward": 0.07500000223517418, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7333333432674408, + "step": 2881 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.7854370117187, + "epoch": 0.9223875820131221, + "grad_norm": 0.08887147158384323, + "kl": 0.20311668664216995, + "learning_rate": 3.6352126573015013e-07, + "loss": 0.0405, + "reward": 1.7395833492279054, + "reward_std": 0.13752839267253875, + "rewards/accuracy_reward": 0.01458333358168602, + "rewards/format_reward": 0.9770833551883698, + "rewards/tag_count_reward": 0.7479166865348816, + "step": 2882 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.2562713623047, + "epoch": 0.9227076332213154, + "grad_norm": 0.17821405827999115, + "kl": 0.2897450774908066, + "learning_rate": 3.605413124600965e-07, + "loss": 0.1086, + "reward": 1.7807292103767396, + "reward_std": 0.22551444172859192, + "rewards/accuracy_reward": 0.07291666958481073, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7432291746139527, + "step": 2883 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.2875152587891, + "epoch": 0.9230276844295088, + "grad_norm": 0.17026633024215698, + "kl": 0.2222075067460537, + "learning_rate": 3.575733993056063e-07, + "loss": 0.0778, + "reward": 1.8151042103767394, + "reward_std": 0.1940957099199295, + "rewards/accuracy_reward": 0.10416666939854621, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 2884 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.1583618164062, + "epoch": 0.923347735637702, + "grad_norm": 0.2168344110250473, + "kl": 0.3349597044289112, + "learning_rate": 3.546175299737342e-07, + "loss": 0.0803, + "reward": 1.7307291984558106, + "reward_std": 0.17358548641204835, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7390625298023223, + "step": 2885 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.9458557128906, + "epoch": 0.9236677868458953, + "grad_norm": 0.2521231472492218, + "kl": 0.24037815183401107, + "learning_rate": 3.5167370815649694e-07, + "loss": 0.0825, + "reward": 1.7151041984558106, + "reward_std": 0.19021971225738527, + "rewards/accuracy_reward": 0.010416666977107525, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 2886 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.1583526611328, + "epoch": 0.9239878380540887, + "grad_norm": 0.17703336477279663, + "kl": 0.21621669009327887, + "learning_rate": 3.4874193753085426e-07, + "loss": 0.0605, + "reward": 1.7875000715255738, + "reward_std": 0.11906407549977302, + "rewards/accuracy_reward": 0.07083333544433117, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7479166805744171, + "step": 2887 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.645849609375, + "epoch": 0.9243078892622819, + "grad_norm": 0.18615660071372986, + "kl": 0.344948410987854, + "learning_rate": 3.458222217587226e-07, + "loss": 0.1063, + "reward": 1.789062535762787, + "reward_std": 0.25289190337061884, + "rewards/accuracy_reward": 0.09375000093132257, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2888 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.3146057128906, + "epoch": 0.9246279404704753, + "grad_norm": 0.07631520926952362, + "kl": 0.15430775843560696, + "learning_rate": 3.4291456448695805e-07, + "loss": 0.0496, + "reward": 1.7645833611488342, + "reward_std": 0.12003937363624573, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.9770833432674408, + "rewards/tag_count_reward": 0.7416666805744171, + "step": 2889 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.5291870117187, + "epoch": 0.9249479916786686, + "grad_norm": 0.24916376173496246, + "kl": 0.35432265847921374, + "learning_rate": 3.4001896934735436e-07, + "loss": 0.0911, + "reward": 1.770312535762787, + "reward_std": 0.21689439043402672, + "rewards/accuracy_reward": 0.07708333395421504, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.736979192495346, + "step": 2890 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.0041870117187, + "epoch": 0.9252680428868619, + "grad_norm": 0.18258577585220337, + "kl": 0.3797022372484207, + "learning_rate": 3.3713543995663735e-07, + "loss": 0.1418, + "reward": 1.6942708611488342, + "reward_std": 0.21223524063825608, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7401041924953461, + "step": 2891 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.5771026611328, + "epoch": 0.9255880940950552, + "grad_norm": 0.1253851056098938, + "kl": 0.2872502990067005, + "learning_rate": 3.34263979916466e-07, + "loss": 0.0809, + "reward": 1.8552083849906922, + "reward_std": 0.2737518347799778, + "rewards/accuracy_reward": 0.16458333786576987, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2892 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.6458587646484, + "epoch": 0.9259081453032485, + "grad_norm": 0.12900535762310028, + "kl": 0.27396869882941244, + "learning_rate": 3.314045928134224e-07, + "loss": 0.1208, + "reward": 1.9781250715255738, + "reward_std": 0.2516678273677826, + "rewards/accuracy_reward": 0.27708334028720855, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2893 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.3500122070312, + "epoch": 0.9262281965114418, + "grad_norm": 0.16405299305915833, + "kl": 0.23438700921833516, + "learning_rate": 3.2855728221900975e-07, + "loss": 0.0605, + "reward": 1.817708396911621, + "reward_std": 0.11167410463094711, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.975000011920929, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2894 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.8250091552734, + "epoch": 0.9265482477196352, + "grad_norm": 0.3562224507331848, + "kl": 0.359239012748003, + "learning_rate": 3.2572205168964645e-07, + "loss": 0.126, + "reward": 1.709375023841858, + "reward_std": 0.20814426690340043, + "rewards/accuracy_reward": 0.01458333358168602, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7364583611488342, + "step": 2895 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.6000183105468, + "epoch": 0.9268682989278284, + "grad_norm": 0.21366332471370697, + "kl": 0.20416639670729636, + "learning_rate": 3.2289890476665975e-07, + "loss": 0.07, + "reward": 1.7697917103767395, + "reward_std": 0.19547061547636985, + "rewards/accuracy_reward": 0.06875000186264515, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2896 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.0104309082031, + "epoch": 0.9271883501360217, + "grad_norm": 0.10463610291481018, + "kl": 0.25862638279795647, + "learning_rate": 3.200878449762901e-07, + "loss": 0.0649, + "reward": 1.8213542342185973, + "reward_std": 0.20296233296394348, + "rewards/accuracy_reward": 0.11666666995733976, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7442708432674408, + "step": 2897 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.3625244140625, + "epoch": 0.9275084013442151, + "grad_norm": 0.2624492645263672, + "kl": 0.31345591209828855, + "learning_rate": 3.172888758296755e-07, + "loss": 0.1211, + "reward": 1.7791666984558105, + "reward_std": 0.2839872606098652, + "rewards/accuracy_reward": 0.1000000024214387, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7333333551883697, + "step": 2898 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.6312652587891, + "epoch": 0.9278284525524084, + "grad_norm": 0.18744568526744843, + "kl": 0.30883694961667063, + "learning_rate": 3.145020008228539e-07, + "loss": 0.1168, + "reward": 1.8046875715255737, + "reward_std": 0.16510328650474548, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2899 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.0708465576172, + "epoch": 0.9281485037606017, + "grad_norm": 0.10015768557786942, + "kl": 0.2992133036255836, + "learning_rate": 3.117272234367563e-07, + "loss": 0.0921, + "reward": 1.744270884990692, + "reward_std": 0.18532090559601783, + "rewards/accuracy_reward": 0.0458333345130086, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7442708492279053, + "step": 2900 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.2625122070312, + "epoch": 0.928468554968795, + "grad_norm": 0.14354941248893738, + "kl": 0.27462932020425795, + "learning_rate": 3.089645471372038e-07, + "loss": 0.0729, + "reward": 1.7692708730697633, + "reward_std": 0.1999375715851784, + "rewards/accuracy_reward": 0.08541666977107525, + "rewards/format_reward": 0.9500000298023223, + "rewards/tag_count_reward": 0.7338541984558106, + "step": 2901 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.508349609375, + "epoch": 0.9287886061769883, + "grad_norm": 0.17410211265087128, + "kl": 0.24624013304710388, + "learning_rate": 3.0621397537490494e-07, + "loss": 0.1219, + "reward": 1.7942708611488343, + "reward_std": 0.3087786689400673, + "rewards/accuracy_reward": 0.10833333730697632, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2902 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.737515258789, + "epoch": 0.9291086573851817, + "grad_norm": 0.14404526352882385, + "kl": 0.18631610609591007, + "learning_rate": 3.0347551158544597e-07, + "loss": 0.0563, + "reward": 1.7703125119209289, + "reward_std": 0.1441615879535675, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7453125238418579, + "step": 2903 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.439599609375, + "epoch": 0.9294287085933749, + "grad_norm": 0.20506027340888977, + "kl": 0.37727788612246516, + "learning_rate": 3.007491591892886e-07, + "loss": 0.0967, + "reward": 1.8562500596046447, + "reward_std": 0.2165341705083847, + "rewards/accuracy_reward": 0.1541666707023978, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2904 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.2062713623047, + "epoch": 0.9297487598015682, + "grad_norm": 0.18293797969818115, + "kl": 0.20713808685541152, + "learning_rate": 2.9803492159177103e-07, + "loss": 0.071, + "reward": 1.807812547683716, + "reward_std": 0.18278513848781586, + "rewards/accuracy_reward": 0.0875000013038516, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.7432291805744171, + "step": 2905 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.0562713623046, + "epoch": 0.9300688110097616, + "grad_norm": 0.22001373767852783, + "kl": 0.17172672897577285, + "learning_rate": 2.953328021830981e-07, + "loss": 0.0572, + "reward": 1.825520896911621, + "reward_std": 0.15396547242999076, + "rewards/accuracy_reward": 0.11041667070239783, + "rewards/format_reward": 0.9729166924953461, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 2906 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.6729431152344, + "epoch": 0.9303888622179549, + "grad_norm": 0.09129034727811813, + "kl": 0.2686162628233433, + "learning_rate": 2.926428043383378e-07, + "loss": 0.0988, + "reward": 1.814583384990692, + "reward_std": 0.2419304519891739, + "rewards/accuracy_reward": 0.11875000353902579, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2907 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.1291931152343, + "epoch": 0.9307089134261481, + "grad_norm": 0.340289443731308, + "kl": 0.6954551450908184, + "learning_rate": 2.8996493141741686e-07, + "loss": 0.1143, + "reward": 1.7781250476837158, + "reward_std": 0.28263785168528555, + "rewards/accuracy_reward": 0.1062500037252903, + "rewards/format_reward": 0.9375000178813935, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2908 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.7979339599609, + "epoch": 0.9310289646343415, + "grad_norm": 0.2295711785554886, + "kl": 0.43438730686903, + "learning_rate": 2.8729918676511983e-07, + "loss": 0.0989, + "reward": 1.8583333849906922, + "reward_std": 0.2656490132212639, + "rewards/accuracy_reward": 0.18541667014360427, + "rewards/format_reward": 0.9354166924953461, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2909 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.5687652587891, + "epoch": 0.9313490158425348, + "grad_norm": 0.1720622330904007, + "kl": 0.35845495462417604, + "learning_rate": 2.846455737110787e-07, + "loss": 0.0689, + "reward": 1.7369791865348816, + "reward_std": 0.21831071525812148, + "rewards/accuracy_reward": 0.05000000149011612, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7369791984558105, + "step": 2910 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.3021057128906, + "epoch": 0.9316690670507282, + "grad_norm": 0.16650277376174927, + "kl": 0.35746832117438315, + "learning_rate": 2.8200409556977894e-07, + "loss": 0.0936, + "reward": 1.8375000357627869, + "reward_std": 0.25913550406694413, + "rewards/accuracy_reward": 0.13958333656191826, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2911 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.5666870117187, + "epoch": 0.9319891182589214, + "grad_norm": 0.12058953940868378, + "kl": 0.24439542815089227, + "learning_rate": 2.7937475564054017e-07, + "loss": 0.0821, + "reward": 1.7796875357627868, + "reward_std": 0.21224845796823502, + "rewards/accuracy_reward": 0.0812500013038516, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2912 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.9521026611328, + "epoch": 0.9323091694671147, + "grad_norm": 0.21773386001586914, + "kl": 0.35191044956445694, + "learning_rate": 2.767575572075287e-07, + "loss": 0.1179, + "reward": 1.7239583730697632, + "reward_std": 0.23633338287472724, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2913 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.6396057128907, + "epoch": 0.9326292206753081, + "grad_norm": 0.14750872552394867, + "kl": 0.5395490519702435, + "learning_rate": 2.74152503539743e-07, + "loss": 0.0997, + "reward": 1.7177083969116211, + "reward_std": 0.19782040342688562, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 2914 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.5583557128906, + "epoch": 0.9329492718835014, + "grad_norm": 0.10624227672815323, + "kl": 0.4461661420762539, + "learning_rate": 2.7155959789101127e-07, + "loss": 0.1291, + "reward": 1.7812500476837159, + "reward_std": 0.28088073134422303, + "rewards/accuracy_reward": 0.10416667088866234, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.7333333611488342, + "step": 2915 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.4812683105469, + "epoch": 0.9332693230916946, + "grad_norm": 0.196004256606102, + "kl": 0.20092077553272247, + "learning_rate": 2.6897884349998735e-07, + "loss": 0.0628, + "reward": 1.8072917103767394, + "reward_std": 0.1401705838739872, + "rewards/accuracy_reward": 0.08958333563059569, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2916 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.5687744140625, + "epoch": 0.933589374299888, + "grad_norm": 0.14856016635894775, + "kl": 0.3332873769104481, + "learning_rate": 2.6641024359015056e-07, + "loss": 0.109, + "reward": 1.8395833849906922, + "reward_std": 0.2291702926158905, + "rewards/accuracy_reward": 0.1395833358168602, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2917 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.2500183105469, + "epoch": 0.9339094255080813, + "grad_norm": 0.3066563010215759, + "kl": 0.5219496801495552, + "learning_rate": 2.638538013697956e-07, + "loss": 0.1372, + "reward": 1.7557292103767395, + "reward_std": 0.24314365535974503, + "rewards/accuracy_reward": 0.07916666977107525, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 2918 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.4666809082031, + "epoch": 0.9342294767162747, + "grad_norm": 0.23431648313999176, + "kl": 0.21682993099093437, + "learning_rate": 2.613095200320359e-07, + "loss": 0.0652, + "reward": 1.8515625715255737, + "reward_std": 0.1445869944989681, + "rewards/accuracy_reward": 0.13958333637565373, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2919 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.3687683105469, + "epoch": 0.9345495279244679, + "grad_norm": 0.20319998264312744, + "kl": 0.2534866757690907, + "learning_rate": 2.587774027547918e-07, + "loss": 0.0787, + "reward": 1.7703125476837158, + "reward_std": 0.2077922374010086, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2920 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.2833435058594, + "epoch": 0.9348695791326612, + "grad_norm": 0.18832314014434814, + "kl": 0.4107868306338787, + "learning_rate": 2.5625745270078775e-07, + "loss": 0.0822, + "reward": 1.7130208730697631, + "reward_std": 0.20654768422245978, + "rewards/accuracy_reward": 0.01666666679084301, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2921 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.7833557128906, + "epoch": 0.9351896303408546, + "grad_norm": 0.24442380666732788, + "kl": 0.2872084707021713, + "learning_rate": 2.5374967301755924e-07, + "loss": 0.0997, + "reward": 1.758333396911621, + "reward_std": 0.2554009936749935, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7333333492279053, + "step": 2922 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.0437622070312, + "epoch": 0.9355096815490479, + "grad_norm": 0.1751808375120163, + "kl": 0.44566518142819406, + "learning_rate": 2.5125406683743417e-07, + "loss": 0.096, + "reward": 1.7348958849906921, + "reward_std": 0.21916937381029128, + "rewards/accuracy_reward": 0.03958333395421505, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2923 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.02294921875, + "epoch": 0.9358297327572411, + "grad_norm": 0.12745539844036102, + "kl": 0.15233333893120288, + "learning_rate": 2.487706372775345e-07, + "loss": 0.0583, + "reward": 1.7635417103767395, + "reward_std": 0.11595178246498108, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.9770833492279053, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 2924 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.4375244140625, + "epoch": 0.9361497839654345, + "grad_norm": 0.17469380795955658, + "kl": 0.3449023649096489, + "learning_rate": 2.4629938743977567e-07, + "loss": 0.1054, + "reward": 1.8369791984558106, + "reward_std": 0.25577750355005263, + "rewards/accuracy_reward": 0.15000000409781933, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.732812511920929, + "step": 2925 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.6416870117188, + "epoch": 0.9364698351736278, + "grad_norm": 0.29228219389915466, + "kl": 0.5941868476569653, + "learning_rate": 2.438403204108597e-07, + "loss": 0.0752, + "reward": 1.8057292222976684, + "reward_std": 0.20368908420205117, + "rewards/accuracy_reward": 0.10625000316649676, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2926 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.3604339599609, + "epoch": 0.9367898863818211, + "grad_norm": 0.30949291586875916, + "kl": 0.20246201269328595, + "learning_rate": 2.413934392622719e-07, + "loss": 0.0582, + "reward": 1.811458373069763, + "reward_std": 0.15752314329147338, + "rewards/accuracy_reward": 0.08750000260770321, + "rewards/format_reward": 0.9750000178813935, + "rewards/tag_count_reward": 0.7489583492279053, + "step": 2927 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.1041839599609, + "epoch": 0.9371099375900144, + "grad_norm": 0.2570835053920746, + "kl": 0.24000799655914307, + "learning_rate": 2.3895874705027635e-07, + "loss": 0.0772, + "reward": 1.8093750476837158, + "reward_std": 0.16699628233909608, + "rewards/accuracy_reward": 0.10833333637565375, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2928 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.5166870117188, + "epoch": 0.9374299887982077, + "grad_norm": 0.28607016801834106, + "kl": 0.3831566788256168, + "learning_rate": 2.3653624681591048e-07, + "loss": 0.0714, + "reward": 1.7671875476837158, + "reward_std": 0.2298620417714119, + "rewards/accuracy_reward": 0.06666666902601719, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2929 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.8125122070312, + "epoch": 0.937750040006401, + "grad_norm": 0.09761839359998703, + "kl": 0.18521942049264908, + "learning_rate": 2.3412594158498836e-07, + "loss": 0.0764, + "reward": 1.7520833492279053, + "reward_std": 0.16688498705625535, + "rewards/accuracy_reward": 0.05208333432674408, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7375000238418579, + "step": 2930 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.4187622070312, + "epoch": 0.9380700912145943, + "grad_norm": 0.43173423409461975, + "kl": 0.29425476044416427, + "learning_rate": 2.3172783436808844e-07, + "loss": 0.1062, + "reward": 1.7093750357627868, + "reward_std": 0.22269544750452042, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.9416666805744172, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2931 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.581265258789, + "epoch": 0.9383901424227876, + "grad_norm": 0.16565969586372375, + "kl": 0.277605714648962, + "learning_rate": 2.2934192816055355e-07, + "loss": 0.063, + "reward": 1.8677083730697632, + "reward_std": 0.20725997984409333, + "rewards/accuracy_reward": 0.15833333786576986, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2932 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.6312622070312, + "epoch": 0.938710193630981, + "grad_norm": 0.07633515447378159, + "kl": 0.28674716129899025, + "learning_rate": 2.2696822594248768e-07, + "loss": 0.1155, + "reward": 1.7755208730697631, + "reward_std": 0.23790881559252738, + "rewards/accuracy_reward": 0.0916666692122817, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7359375119209289, + "step": 2933 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.2958435058594, + "epoch": 0.9390302448391743, + "grad_norm": 0.1412808895111084, + "kl": 0.2938103273510933, + "learning_rate": 2.2460673067875029e-07, + "loss": 0.0979, + "reward": 1.723437523841858, + "reward_std": 0.18467547446489335, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 2934 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.4854339599609, + "epoch": 0.9393502960473675, + "grad_norm": 0.17745190858840942, + "kl": 0.3535077393054962, + "learning_rate": 2.2225744531895632e-07, + "loss": 0.094, + "reward": 1.7895833730697632, + "reward_std": 0.2649504989385605, + "rewards/accuracy_reward": 0.09375000353902578, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 2935 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.5416931152344, + "epoch": 0.9396703472555609, + "grad_norm": 0.23511171340942383, + "kl": 0.43363613411784174, + "learning_rate": 2.1992037279746746e-07, + "loss": 0.0866, + "reward": 1.7776042103767395, + "reward_std": 0.2053623117506504, + "rewards/accuracy_reward": 0.07500000279396772, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7380208432674408, + "step": 2936 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.0541870117188, + "epoch": 0.9399903984637542, + "grad_norm": 0.21815741062164307, + "kl": 0.2682798236608505, + "learning_rate": 2.1759551603339092e-07, + "loss": 0.0642, + "reward": 1.832812535762787, + "reward_std": 0.2220204994082451, + "rewards/accuracy_reward": 0.14375000409781932, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7369791805744171, + "step": 2937 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.4479278564453, + "epoch": 0.9403104496719475, + "grad_norm": 0.18351462483406067, + "kl": 0.21441592164337636, + "learning_rate": 2.1528287793057934e-07, + "loss": 0.0714, + "reward": 1.7739583730697632, + "reward_std": 0.1786945417523384, + "rewards/accuracy_reward": 0.07083333469927311, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.740625011920929, + "step": 2938 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.1896026611328, + "epoch": 0.9406305008801408, + "grad_norm": 0.20814381539821625, + "kl": 0.31521559022367, + "learning_rate": 2.129824613776188e-07, + "loss": 0.0862, + "reward": 1.707812523841858, + "reward_std": 0.16487954929471016, + "rewards/accuracy_reward": 0.00625, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2939 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.5250244140625, + "epoch": 0.9409505520883341, + "grad_norm": 0.14655859768390656, + "kl": 0.24050994589924812, + "learning_rate": 2.1069426924783532e-07, + "loss": 0.0874, + "reward": 1.8135416984558106, + "reward_std": 0.22481777146458626, + "rewards/accuracy_reward": 0.1041666705161333, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 2940 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.1125091552734, + "epoch": 0.9412706032965275, + "grad_norm": 0.35549604892730713, + "kl": 0.33449497222900393, + "learning_rate": 2.0841830439928045e-07, + "loss": 0.0939, + "reward": 1.752083384990692, + "reward_std": 0.2661015272140503, + "rewards/accuracy_reward": 0.060416669212281705, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7416666924953461, + "step": 2941 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.5687774658203, + "epoch": 0.9415906545047208, + "grad_norm": 0.18642841279506683, + "kl": 0.4748840194195509, + "learning_rate": 2.06154569674738e-07, + "loss": 0.0888, + "reward": 1.8156250357627868, + "reward_std": 0.19740560948848723, + "rewards/accuracy_reward": 0.12708333730697632, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 2942 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.5187561035157, + "epoch": 0.941910705712914, + "grad_norm": 0.1558806449174881, + "kl": 0.5146293081343174, + "learning_rate": 2.0390306790171398e-07, + "loss": 0.0856, + "reward": 1.7010416865348816, + "reward_std": 0.23505208715796472, + "rewards/accuracy_reward": 0.02708333432674408, + "rewards/format_reward": 0.9375000178813935, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2943 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.527099609375, + "epoch": 0.9422307569211074, + "grad_norm": 0.07957801222801208, + "kl": 0.28175764046609403, + "learning_rate": 2.016638018924344e-07, + "loss": 0.096, + "reward": 1.7854167103767395, + "reward_std": 0.2046665370464325, + "rewards/accuracy_reward": 0.09583333637565375, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.737500011920929, + "step": 2944 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.7521057128906, + "epoch": 0.9425508081293007, + "grad_norm": 0.09544239938259125, + "kl": 0.2202799826860428, + "learning_rate": 1.9943677444384192e-07, + "loss": 0.0668, + "reward": 1.743750011920929, + "reward_std": 0.17324633449316024, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2945 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.3604431152344, + "epoch": 0.942870859337494, + "grad_norm": 0.13867995142936707, + "kl": 0.28282299637794495, + "learning_rate": 1.9722198833759366e-07, + "loss": 0.0822, + "reward": 1.7552083611488343, + "reward_std": 0.15816964358091354, + "rewards/accuracy_reward": 0.04375000130385161, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 2946 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.339599609375, + "epoch": 0.9431909105456873, + "grad_norm": 0.12346024811267853, + "kl": 0.21222805231809616, + "learning_rate": 1.95019446340059e-07, + "loss": 0.0875, + "reward": 1.7489583611488342, + "reward_std": 0.18876290768384935, + "rewards/accuracy_reward": 0.04791666828095913, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 2947 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.9896026611328, + "epoch": 0.9435109617538806, + "grad_norm": 0.22070105373859406, + "kl": 0.24906435757875442, + "learning_rate": 1.928291512023106e-07, + "loss": 0.0606, + "reward": 1.7786458730697632, + "reward_std": 0.16378662288188933, + "rewards/accuracy_reward": 0.07291666995733977, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 2948 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.9604278564453, + "epoch": 0.943831012962074, + "grad_norm": 0.25414496660232544, + "kl": 0.45791466608643533, + "learning_rate": 1.9065110566012347e-07, + "loss": 0.0893, + "reward": 1.806770884990692, + "reward_std": 0.18329955637454987, + "rewards/accuracy_reward": 0.11250000204890967, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 2949 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.8354400634765, + "epoch": 0.9441510641702673, + "grad_norm": 0.10287056118249893, + "kl": 0.19068565890192984, + "learning_rate": 1.8848531243397471e-07, + "loss": 0.0742, + "reward": 1.8005208611488341, + "reward_std": 0.2313265398144722, + "rewards/accuracy_reward": 0.09791666902601719, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2950 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.2437683105469, + "epoch": 0.9444711153784605, + "grad_norm": 0.31609681248664856, + "kl": 0.4250298887491226, + "learning_rate": 1.8633177422903824e-07, + "loss": 0.1, + "reward": 1.7354166984558106, + "reward_std": 0.23066288232803345, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 2951 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.0771118164063, + "epoch": 0.9447911665866539, + "grad_norm": 0.1550053358078003, + "kl": 0.4375451445579529, + "learning_rate": 1.8419049373517904e-07, + "loss": 0.0789, + "reward": 1.7687500357627868, + "reward_std": 0.1940486691892147, + "rewards/accuracy_reward": 0.06458333376795053, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7416666924953461, + "step": 2952 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.4896118164063, + "epoch": 0.9451112177948472, + "grad_norm": 0.12883856892585754, + "kl": 0.2577236250042915, + "learning_rate": 1.8206147362695214e-07, + "loss": 0.0695, + "reward": 1.7578125238418578, + "reward_std": 0.2019563138484955, + "rewards/accuracy_reward": 0.04583333395421505, + "rewards/format_reward": 0.9687500298023224, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 2953 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.5916870117187, + "epoch": 0.9454312690030405, + "grad_norm": 0.2383638322353363, + "kl": 0.3122972398996353, + "learning_rate": 1.7994471656359814e-07, + "loss": 0.1238, + "reward": 1.7244791865348816, + "reward_std": 0.24809951484203338, + "rewards/accuracy_reward": 0.043750000186264515, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7328125357627868, + "step": 2954 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.689599609375, + "epoch": 0.9457513202112338, + "grad_norm": 0.16189329326152802, + "kl": 0.22522822245955468, + "learning_rate": 1.778402251890432e-07, + "loss": 0.0805, + "reward": 1.7411458611488342, + "reward_std": 0.18320491760969163, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.743229192495346, + "step": 2955 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.5250122070313, + "epoch": 0.9460713714194271, + "grad_norm": 0.13715258240699768, + "kl": 0.3096561312675476, + "learning_rate": 1.7574800213189137e-07, + "loss": 0.1016, + "reward": 1.8203125476837159, + "reward_std": 0.26009301394224166, + "rewards/accuracy_reward": 0.1270833369344473, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7432291805744171, + "step": 2956 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.7750122070313, + "epoch": 0.9463914226276204, + "grad_norm": 0.08837933093309402, + "kl": 0.2637955330312252, + "learning_rate": 1.7366805000542108e-07, + "loss": 0.0762, + "reward": 1.7151042103767395, + "reward_std": 0.19586375132203102, + "rewards/accuracy_reward": 0.016666667349636555, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2957 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.7979339599609, + "epoch": 0.9467114738358138, + "grad_norm": 0.1346248984336853, + "kl": 0.20087103992700578, + "learning_rate": 1.7160037140758645e-07, + "loss": 0.0779, + "reward": 1.862500047683716, + "reward_std": 0.1999605506658554, + "rewards/accuracy_reward": 0.15625000409781933, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 2958 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.5771118164063, + "epoch": 0.947031525044007, + "grad_norm": 0.17053905129432678, + "kl": 0.2561337880790234, + "learning_rate": 1.6954496892101047e-07, + "loss": 0.0945, + "reward": 1.7369791984558105, + "reward_std": 0.17246170416474343, + "rewards/accuracy_reward": 0.03958333451300859, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7411458432674408, + "step": 2959 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.6000305175781, + "epoch": 0.9473515762522003, + "grad_norm": 0.19663800299167633, + "kl": 0.35930315852165223, + "learning_rate": 1.6750184511298285e-07, + "loss": 0.1183, + "reward": 1.7598958611488342, + "reward_std": 0.18631593957543374, + "rewards/accuracy_reward": 0.06041666883975268, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2960 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.0625183105469, + "epoch": 0.9476716274603937, + "grad_norm": 0.1454787403345108, + "kl": 0.19797628447413446, + "learning_rate": 1.6547100253545889e-07, + "loss": 0.0749, + "reward": 1.8822917342185974, + "reward_std": 0.1931290477514267, + "rewards/accuracy_reward": 0.16041667126119136, + "rewards/format_reward": 0.9791666805744171, + "rewards/tag_count_reward": 0.7427083432674408, + "step": 2961 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.8021026611328, + "epoch": 0.947991678668587, + "grad_norm": 0.18358418345451355, + "kl": 0.22578486204147338, + "learning_rate": 1.6345244372504842e-07, + "loss": 0.0949, + "reward": 1.7697916984558106, + "reward_std": 0.18777953833341599, + "rewards/accuracy_reward": 0.06250000130385161, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2962 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.2937683105469, + "epoch": 0.9483117298767803, + "grad_norm": 0.1297026425600052, + "kl": 0.3527979046106339, + "learning_rate": 1.6144617120302351e-07, + "loss": 0.0965, + "reward": 1.8135417461395265, + "reward_std": 0.2634100914001465, + "rewards/accuracy_reward": 0.12916666977107524, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 2963 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.8333557128906, + "epoch": 0.9486317810849736, + "grad_norm": 0.10099179297685623, + "kl": 0.282944992184639, + "learning_rate": 1.5945218747530855e-07, + "loss": 0.0808, + "reward": 1.7708333730697632, + "reward_std": 0.21695861518383025, + "rewards/accuracy_reward": 0.0791666703298688, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 2964 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.195849609375, + "epoch": 0.9489518322931669, + "grad_norm": 0.1370352953672409, + "kl": 0.34712174348533154, + "learning_rate": 1.5747049503248013e-07, + "loss": 0.0634, + "reward": 1.7697916984558106, + "reward_std": 0.1845792807638645, + "rewards/accuracy_reward": 0.05416666939854622, + "rewards/format_reward": 0.9708333432674408, + "rewards/tag_count_reward": 0.7447916746139527, + "step": 2965 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.9771179199219, + "epoch": 0.9492718835013603, + "grad_norm": 0.16078034043312073, + "kl": 0.3685935214161873, + "learning_rate": 1.5550109634975718e-07, + "loss": 0.1126, + "reward": 1.7260416984558105, + "reward_std": 0.2447558268904686, + "rewards/accuracy_reward": 0.04375000055879354, + "rewards/format_reward": 0.9458333611488342, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2966 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.1083526611328, + "epoch": 0.9495919347095535, + "grad_norm": 0.2678912878036499, + "kl": 0.35141145139932634, + "learning_rate": 1.5354399388700868e-07, + "loss": 0.1208, + "reward": 1.7854167222976685, + "reward_std": 0.2304367497563362, + "rewards/accuracy_reward": 0.10625000111758709, + "rewards/format_reward": 0.9437500238418579, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2967 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.5291870117187, + "epoch": 0.9499119859177468, + "grad_norm": 0.20223970711231232, + "kl": 0.3309009000658989, + "learning_rate": 1.5159919008874368e-07, + "loss": 0.1449, + "reward": 1.8500000476837157, + "reward_std": 0.24914255663752555, + "rewards/accuracy_reward": 0.166666672937572, + "rewards/format_reward": 0.9479166805744171, + "rewards/tag_count_reward": 0.7354166805744171, + "step": 2968 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.0000183105469, + "epoch": 0.9502320371259402, + "grad_norm": 0.10167260468006134, + "kl": 0.17943341620266437, + "learning_rate": 1.4966668738410905e-07, + "loss": 0.0396, + "reward": 1.8109375357627868, + "reward_std": 0.13079179599881172, + "rewards/accuracy_reward": 0.08541666883975267, + "rewards/format_reward": 0.9770833373069763, + "rewards/tag_count_reward": 0.7484375059604644, + "step": 2969 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.5729339599609, + "epoch": 0.9505520883341335, + "grad_norm": 0.11473032832145691, + "kl": 0.2629383150488138, + "learning_rate": 1.477464881868862e-07, + "loss": 0.0789, + "reward": 1.8244792222976685, + "reward_std": 0.1891520008444786, + "rewards/accuracy_reward": 0.12083333693444728, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7411458432674408, + "step": 2970 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.7437713623046, + "epoch": 0.9508721395423267, + "grad_norm": 0.12449677288532257, + "kl": 0.2245940238237381, + "learning_rate": 1.458385948954899e-07, + "loss": 0.0666, + "reward": 1.8380208969116212, + "reward_std": 0.19873855113983155, + "rewards/accuracy_reward": 0.13333333805203437, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 2971 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.5583557128906, + "epoch": 0.9511921907505201, + "grad_norm": 0.29751840233802795, + "kl": 0.4296580046415329, + "learning_rate": 1.4394300989296618e-07, + "loss": 0.1319, + "reward": 1.7692708730697633, + "reward_std": 0.23732186257839202, + "rewards/accuracy_reward": 0.08750000204890966, + "rewards/format_reward": 0.9395833432674408, + "rewards/tag_count_reward": 0.7421875119209289, + "step": 2972 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.5562683105469, + "epoch": 0.9515122419587134, + "grad_norm": 0.11807762831449509, + "kl": 0.23386535197496414, + "learning_rate": 1.4205973554698548e-07, + "loss": 0.0742, + "reward": 1.7604166865348816, + "reward_std": 0.2106850653886795, + "rewards/accuracy_reward": 0.0541666679084301, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2973 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.064599609375, + "epoch": 0.9518322931669067, + "grad_norm": 0.14003440737724304, + "kl": 0.21622378900647163, + "learning_rate": 1.4018877420983956e-07, + "loss": 0.0603, + "reward": 1.7697917222976685, + "reward_std": 0.18948814198374747, + "rewards/accuracy_reward": 0.05625000167638063, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7427083432674408, + "step": 2974 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.5562561035156, + "epoch": 0.9521523443751, + "grad_norm": 0.10616900771856308, + "kl": 0.22268827855587006, + "learning_rate": 1.383301282184446e-07, + "loss": 0.089, + "reward": 1.7739583611488343, + "reward_std": 0.19601852148771287, + "rewards/accuracy_reward": 0.06250000316649676, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7447916805744171, + "step": 2975 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.8604309082032, + "epoch": 0.9524723955832933, + "grad_norm": 0.14727354049682617, + "kl": 0.3326699022203684, + "learning_rate": 1.3648379989433135e-07, + "loss": 0.1121, + "reward": 1.7796875476837157, + "reward_std": 0.23153150603175163, + "rewards/accuracy_reward": 0.09791666977107524, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.735937523841858, + "step": 2976 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.3062683105469, + "epoch": 0.9527924467914867, + "grad_norm": 0.1879003345966339, + "kl": 0.24564929269254207, + "learning_rate": 1.3464979154364844e-07, + "loss": 0.074, + "reward": 1.8286458730697632, + "reward_std": 0.17841232642531396, + "rewards/accuracy_reward": 0.12083333730697632, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2977 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.3708587646485, + "epoch": 0.9531124979996799, + "grad_norm": 0.24627186357975006, + "kl": 0.2939072445034981, + "learning_rate": 1.328281054571534e-07, + "loss": 0.1061, + "reward": 1.7781250715255736, + "reward_std": 0.23098910599946976, + "rewards/accuracy_reward": 0.08750000372529029, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7364583551883698, + "step": 2978 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.483349609375, + "epoch": 0.9534325492078732, + "grad_norm": 0.11084544658660889, + "kl": 0.3061968058347702, + "learning_rate": 1.3101874391021285e-07, + "loss": 0.0848, + "reward": 1.8708333849906922, + "reward_std": 0.20706724524497985, + "rewards/accuracy_reward": 0.17500000596046447, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 2979 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.2666809082032, + "epoch": 0.9537526004160666, + "grad_norm": 0.16704536974430084, + "kl": 0.42880779802799224, + "learning_rate": 1.2922170916280118e-07, + "loss": 0.0408, + "reward": 1.8218750476837158, + "reward_std": 0.13136814534664154, + "rewards/accuracy_reward": 0.10625000316649676, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 2980 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.6833618164062, + "epoch": 0.9540726516242599, + "grad_norm": 0.18254122138023376, + "kl": 0.3421017203480005, + "learning_rate": 1.274370034594974e-07, + "loss": 0.1099, + "reward": 1.7604167222976685, + "reward_std": 0.24414713978767394, + "rewards/accuracy_reward": 0.0791666692122817, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.7375000178813934, + "step": 2981 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.4562683105469, + "epoch": 0.9543927028324531, + "grad_norm": 0.18417489528656006, + "kl": 0.282155305147171, + "learning_rate": 1.2566462902947496e-07, + "loss": 0.0854, + "reward": 1.817708396911621, + "reward_std": 0.24376452192664147, + "rewards/accuracy_reward": 0.13125000540167092, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.7406250298023224, + "step": 2982 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.7604431152344, + "epoch": 0.9547127540406465, + "grad_norm": 0.21982231736183167, + "kl": 0.3524301677942276, + "learning_rate": 1.2390458808651085e-07, + "loss": 0.0805, + "reward": 1.8322916984558106, + "reward_std": 0.29580608904361727, + "rewards/accuracy_reward": 0.14166667107492686, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7343750238418579, + "step": 2983 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.364599609375, + "epoch": 0.9550328052488398, + "grad_norm": 0.18464107811450958, + "kl": 0.2895275134593248, + "learning_rate": 1.2215688282897542e-07, + "loss": 0.1011, + "reward": 1.7953125715255738, + "reward_std": 0.21910280585289002, + "rewards/accuracy_reward": 0.1083333345130086, + "rewards/format_reward": 0.9520833432674408, + "rewards/tag_count_reward": 0.7348958492279053, + "step": 2984 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.8750152587891, + "epoch": 0.9553528564570332, + "grad_norm": 0.2103576511144638, + "kl": 0.48140868097543715, + "learning_rate": 1.2042151543983028e-07, + "loss": 0.0708, + "reward": 1.8223958730697631, + "reward_std": 0.18225472569465637, + "rewards/accuracy_reward": 0.11458333879709244, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.739062511920929, + "step": 2985 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.4375183105469, + "epoch": 0.9556729076652264, + "grad_norm": 0.10476110875606537, + "kl": 0.3020617179572582, + "learning_rate": 1.186984880866271e-07, + "loss": 0.0947, + "reward": 1.784375047683716, + "reward_std": 0.20616189390420914, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 2986 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.6562744140625, + "epoch": 0.9559929588734197, + "grad_norm": 0.1183815449476242, + "kl": 0.30500880554318427, + "learning_rate": 1.1698780292150325e-07, + "loss": 0.0719, + "reward": 1.864583384990692, + "reward_std": 0.2061496764421463, + "rewards/accuracy_reward": 0.15625000447034837, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 2987 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.670849609375, + "epoch": 0.9563130100816131, + "grad_norm": 0.2120182067155838, + "kl": 0.2620538957417011, + "learning_rate": 1.1528946208118286e-07, + "loss": 0.0741, + "reward": 1.7244791984558105, + "reward_std": 0.19255055412650107, + "rewards/accuracy_reward": 0.02083333358168602, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 2988 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.0458587646484, + "epoch": 0.9566330612898064, + "grad_norm": 0.18735171854496002, + "kl": 0.3196378767490387, + "learning_rate": 1.1360346768696907e-07, + "loss": 0.0928, + "reward": 1.7416666984558105, + "reward_std": 0.24244018495082856, + "rewards/accuracy_reward": 0.05833333563059569, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7375000298023224, + "step": 2989 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.8541931152344, + "epoch": 0.9569531124979996, + "grad_norm": 0.09267427027225494, + "kl": 0.17590968012809755, + "learning_rate": 1.11929821844744e-07, + "loss": 0.0771, + "reward": 1.7635416984558105, + "reward_std": 0.16972372829914092, + "rewards/accuracy_reward": 0.05000000111758709, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2990 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.9312530517578, + "epoch": 0.957273163706193, + "grad_norm": 0.12013649940490723, + "kl": 0.2494128279387951, + "learning_rate": 1.1026852664496656e-07, + "loss": 0.0976, + "reward": 1.784375047683716, + "reward_std": 0.1636178210377693, + "rewards/accuracy_reward": 0.0729166679084301, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 2991 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.4687683105469, + "epoch": 0.9575932149143863, + "grad_norm": 0.14498887956142426, + "kl": 0.18977786004543304, + "learning_rate": 1.0861958416266805e-07, + "loss": 0.0593, + "reward": 1.8005208730697633, + "reward_std": 0.19197710752487182, + "rewards/accuracy_reward": 0.0854166679084301, + "rewards/format_reward": 0.9687500238418579, + "rewards/tag_count_reward": 0.7463541805744172, + "step": 2992 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.1187683105469, + "epoch": 0.9579132661225797, + "grad_norm": 0.09392816573381424, + "kl": 0.33596227318048477, + "learning_rate": 1.0698299645745203e-07, + "loss": 0.0888, + "reward": 1.7036458611488343, + "reward_std": 0.21316526755690574, + "rewards/accuracy_reward": 0.018750000186264514, + "rewards/format_reward": 0.9479166805744171, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 2993 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.0562622070313, + "epoch": 0.9582333173307729, + "grad_norm": 0.11171252280473709, + "kl": 0.22640425711870193, + "learning_rate": 1.0535876557349111e-07, + "loss": 0.0706, + "reward": 1.7869792103767395, + "reward_std": 0.18889242559671401, + "rewards/accuracy_reward": 0.07500000298023224, + "rewards/format_reward": 0.9729166924953461, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 2994 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.7166778564454, + "epoch": 0.9585533685389662, + "grad_norm": 0.10955782979726791, + "kl": 0.22637251615524293, + "learning_rate": 1.0374689353952027e-07, + "loss": 0.057, + "reward": 1.810937523841858, + "reward_std": 0.16208795085549355, + "rewards/accuracy_reward": 0.10000000204890966, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 2995 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.2229400634766, + "epoch": 0.9588734197471596, + "grad_norm": 0.285765677690506, + "kl": 0.27739047557115554, + "learning_rate": 1.0214738236884014e-07, + "loss": 0.0849, + "reward": 1.8604167103767395, + "reward_std": 0.19695503115653992, + "rewards/accuracy_reward": 0.15625000316649676, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 2996 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.2750183105469, + "epoch": 0.9591934709553529, + "grad_norm": 0.2077830582857132, + "kl": 0.36373098865151404, + "learning_rate": 1.0056023405931259e-07, + "loss": 0.0933, + "reward": 1.742187535762787, + "reward_std": 0.2123005896806717, + "rewards/accuracy_reward": 0.0520833358168602, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 2997 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.8916839599609, + "epoch": 0.9595135221635461, + "grad_norm": 0.1561278998851776, + "kl": 0.2940549574792385, + "learning_rate": 9.898545059335852e-08, + "loss": 0.0842, + "reward": 1.717187523841858, + "reward_std": 0.1930335447192192, + "rewards/accuracy_reward": 0.022916667349636554, + "rewards/format_reward": 0.956250011920929, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 2998 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.1375213623047, + "epoch": 0.9598335733717395, + "grad_norm": 0.10308010131120682, + "kl": 0.22390391640365123, + "learning_rate": 9.742303393795005e-08, + "loss": 0.0928, + "reward": 1.8218750476837158, + "reward_std": 0.19930132627487182, + "rewards/accuracy_reward": 0.12500000409781933, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 2999 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.5979370117187, + "epoch": 0.9601536245799328, + "grad_norm": 0.16804414987564087, + "kl": 0.21638592407107354, + "learning_rate": 9.587298604461614e-08, + "loss": 0.0584, + "reward": 1.7817708849906921, + "reward_std": 0.1360452577471733, + "rewards/accuracy_reward": 0.07291666865348816, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7463541865348816, + "step": 3000 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.927099609375, + "epoch": 0.9604736757881261, + "grad_norm": 0.09598581492900848, + "kl": 0.1559834960848093, + "learning_rate": 9.433530884943698e-08, + "loss": 0.0599, + "reward": 1.767708384990692, + "reward_std": 0.12271808385848999, + "rewards/accuracy_reward": 0.04583333469927311, + "rewards/format_reward": 0.9770833432674408, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 3001 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.9541900634765, + "epoch": 0.9607937269963194, + "grad_norm": 0.12407588958740234, + "kl": 0.360856419801712, + "learning_rate": 9.281000427304066e-08, + "loss": 0.069, + "reward": 1.8088541984558106, + "reward_std": 0.20785682201385497, + "rewards/accuracy_reward": 0.10833333637565375, + "rewards/format_reward": 0.9583333611488343, + "rewards/tag_count_reward": 0.7421875298023224, + "step": 3002 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.9437713623047, + "epoch": 0.9611137782045127, + "grad_norm": 0.12733730673789978, + "kl": 0.2946822591125965, + "learning_rate": 9.129707422059986e-08, + "loss": 0.0782, + "reward": 1.7244791984558105, + "reward_std": 0.16163463965058328, + "rewards/accuracy_reward": 0.018750000558793545, + "rewards/format_reward": 0.9625000119209289, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 3003 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.0750152587891, + "epoch": 0.961433829412706, + "grad_norm": 0.3727104365825653, + "kl": 0.2450065303593874, + "learning_rate": 8.979652058183185e-08, + "loss": 0.1028, + "reward": 1.8244792222976685, + "reward_std": 0.22431774139404298, + "rewards/accuracy_reward": 0.11875000149011612, + "rewards/format_reward": 0.9645833432674408, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 3004 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.4645965576171, + "epoch": 0.9617538806208994, + "grad_norm": 0.12149068713188171, + "kl": 0.20398061200976372, + "learning_rate": 8.830834523099518e-08, + "loss": 0.0679, + "reward": 1.773437535762787, + "reward_std": 0.1679071843624115, + "rewards/accuracy_reward": 0.0645833358168602, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7442708492279053, + "step": 3005 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.0854370117188, + "epoch": 0.9620739318290926, + "grad_norm": 0.1221243143081665, + "kl": 0.39972667805850504, + "learning_rate": 8.683255002688962e-08, + "loss": 0.113, + "reward": 1.7072916865348815, + "reward_std": 0.22407453507184982, + "rewards/accuracy_reward": 0.01666666716337204, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 3006 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.8416870117187, + "epoch": 0.962393983037286, + "grad_norm": 0.12119229882955551, + "kl": 0.21771190762519838, + "learning_rate": 8.536913681284731e-08, + "loss": 0.0849, + "reward": 1.8875000596046447, + "reward_std": 0.2431383326649666, + "rewards/accuracy_reward": 0.18958333935588598, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 3007 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.3979278564453, + "epoch": 0.9627140342454793, + "grad_norm": 0.36122357845306396, + "kl": 0.2558132287114859, + "learning_rate": 8.391810741673722e-08, + "loss": 0.0811, + "reward": 1.783333384990692, + "reward_std": 0.20741451680660247, + "rewards/accuracy_reward": 0.07708333544433117, + "rewards/format_reward": 0.9645833432674408, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 3008 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.7666809082032, + "epoch": 0.9630340854536726, + "grad_norm": 0.22902488708496094, + "kl": 0.29923873767256737, + "learning_rate": 8.24794636509596e-08, + "loss": 0.0986, + "reward": 1.8473958611488341, + "reward_std": 0.23173879757523536, + "rewards/accuracy_reward": 0.1416666716337204, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 3009 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.310433959961, + "epoch": 0.9633541366618659, + "grad_norm": 0.22059831023216248, + "kl": 0.2688716005533934, + "learning_rate": 8.105320731244703e-08, + "loss": 0.0727, + "reward": 1.8421875715255738, + "reward_std": 0.18312807828187944, + "rewards/accuracy_reward": 0.12708333563059568, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7442708492279053, + "step": 3010 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.7916961669922, + "epoch": 0.9636741878700592, + "grad_norm": 0.36360955238342285, + "kl": 0.4199592448771, + "learning_rate": 7.963934018265562e-08, + "loss": 0.0867, + "reward": 1.7635416984558105, + "reward_std": 0.23115942478179932, + "rewards/accuracy_reward": 0.0916666692122817, + "rewards/format_reward": 0.9375000119209289, + "rewards/tag_count_reward": 0.7343750059604645, + "step": 3011 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.1250183105469, + "epoch": 0.9639942390782525, + "grad_norm": 0.22022384405136108, + "kl": 0.3199925169348717, + "learning_rate": 7.823786402756827e-08, + "loss": 0.1118, + "reward": 1.7937500715255736, + "reward_std": 0.23845134377479554, + "rewards/accuracy_reward": 0.10416666995733977, + "rewards/format_reward": 0.9479166984558105, + "rewards/tag_count_reward": 0.7416666924953461, + "step": 3012 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.6333435058593, + "epoch": 0.9643142902864459, + "grad_norm": 0.15807892382144928, + "kl": 0.33093359544873235, + "learning_rate": 7.684878059769363e-08, + "loss": 0.102, + "reward": 1.794270884990692, + "reward_std": 0.2638327829539776, + "rewards/accuracy_reward": 0.11875000111758709, + "rewards/format_reward": 0.9395833611488342, + "rewards/tag_count_reward": 0.735937523841858, + "step": 3013 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.2354339599609, + "epoch": 0.9646343414946391, + "grad_norm": 0.13306574523448944, + "kl": 0.27508914321660993, + "learning_rate": 7.547209162805824e-08, + "loss": 0.0831, + "reward": 1.820833396911621, + "reward_std": 0.2090136304497719, + "rewards/accuracy_reward": 0.11458333749324083, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 3014 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.6041900634766, + "epoch": 0.9649543927028325, + "grad_norm": 0.10513435304164886, + "kl": 0.2855644281953573, + "learning_rate": 7.410779883820663e-08, + "loss": 0.0818, + "reward": 1.765625035762787, + "reward_std": 0.17442589700222016, + "rewards/accuracy_reward": 0.05625000204890966, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.740625011920929, + "step": 3015 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.6416870117188, + "epoch": 0.9652744439110258, + "grad_norm": 0.11084417253732681, + "kl": 0.26390017867088317, + "learning_rate": 7.275590393220456e-08, + "loss": 0.0724, + "reward": 1.7838541865348816, + "reward_std": 0.19236919954419135, + "rewards/accuracy_reward": 0.08125000242143869, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 3016 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.6958526611328, + "epoch": 0.965594495119219, + "grad_norm": 0.24113225936889648, + "kl": 0.3328822206705809, + "learning_rate": 7.141640859862576e-08, + "loss": 0.0785, + "reward": 1.7536458730697633, + "reward_std": 0.1738430380821228, + "rewards/accuracy_reward": 0.04375000055879354, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 3017 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.5562683105469, + "epoch": 0.9659145463274124, + "grad_norm": 0.21408711373806, + "kl": 0.43628372699022294, + "learning_rate": 7.0089314510563e-08, + "loss": 0.1153, + "reward": 1.7932292103767395, + "reward_std": 0.22712324783205987, + "rewards/accuracy_reward": 0.1000000037252903, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7369791805744171, + "step": 3018 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.3750183105469, + "epoch": 0.9662345975356057, + "grad_norm": 0.247038334608078, + "kl": 0.26444780342280866, + "learning_rate": 6.877462332561479e-08, + "loss": 0.0754, + "reward": 1.7697916865348815, + "reward_std": 0.18048504441976548, + "rewards/accuracy_reward": 0.07291666772216558, + "rewards/format_reward": 0.9541666746139527, + "rewards/tag_count_reward": 0.7427083432674408, + "step": 3019 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.7625244140625, + "epoch": 0.966554648743799, + "grad_norm": 0.10298003256320953, + "kl": 0.23233777694404126, + "learning_rate": 6.747233668588981e-08, + "loss": 0.0442, + "reward": 1.8005208611488341, + "reward_std": 0.13236782178282738, + "rewards/accuracy_reward": 0.07916666977107525, + "rewards/format_reward": 0.9791666805744171, + "rewards/tag_count_reward": 0.7421875119209289, + "step": 3020 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.0062652587891, + "epoch": 0.9668746999519923, + "grad_norm": 0.1813294142484665, + "kl": 0.2319230657070875, + "learning_rate": 6.618245621800135e-08, + "loss": 0.0954, + "reward": 1.8520833849906921, + "reward_std": 0.2161620169878006, + "rewards/accuracy_reward": 0.1458333360031247, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7395833611488343, + "step": 3021 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.5916809082031, + "epoch": 0.9671947511601856, + "grad_norm": 0.1469898223876953, + "kl": 0.2930970214307308, + "learning_rate": 6.49049835330684e-08, + "loss": 0.1043, + "reward": 1.709375023841858, + "reward_std": 0.20650226771831512, + "rewards/accuracy_reward": 0.020833334326744078, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 3022 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.5625183105469, + "epoch": 0.967514802368379, + "grad_norm": 0.1649038940668106, + "kl": 0.3373178992420435, + "learning_rate": 6.36399202267135e-08, + "loss": 0.0835, + "reward": 1.7692708849906922, + "reward_std": 0.20760041624307632, + "rewards/accuracy_reward": 0.07291666809469462, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 3023 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.9000244140625, + "epoch": 0.9678348535765723, + "grad_norm": 0.1377795785665512, + "kl": 0.3631039060652256, + "learning_rate": 6.23872678790538e-08, + "loss": 0.0643, + "reward": 1.7510417103767395, + "reward_std": 0.21666239500045775, + "rewards/accuracy_reward": 0.05625000260770321, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 3024 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.9229370117188, + "epoch": 0.9681549047847655, + "grad_norm": 0.2074446827173233, + "kl": 0.3897064059972763, + "learning_rate": 6.114702805471107e-08, + "loss": 0.1027, + "reward": 1.7135416865348816, + "reward_std": 0.22525653690099717, + "rewards/accuracy_reward": 0.02291666716337204, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 3025 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.4541809082032, + "epoch": 0.9684749559929589, + "grad_norm": 0.10851828008890152, + "kl": 0.273499009013176, + "learning_rate": 5.991920230279946e-08, + "loss": 0.0826, + "reward": 1.7744792103767395, + "reward_std": 0.18417903929948806, + "rewards/accuracy_reward": 0.08125000298023224, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 3026 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.989599609375, + "epoch": 0.9687950072011522, + "grad_norm": 0.0974067747592926, + "kl": 0.3477734237909317, + "learning_rate": 5.870379215692778e-08, + "loss": 0.1281, + "reward": 1.8484375596046447, + "reward_std": 0.24913154244422914, + "rewards/accuracy_reward": 0.15833333786576986, + "rewards/format_reward": 0.9520833611488342, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 3027 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.3396026611329, + "epoch": 0.9691150584093455, + "grad_norm": 0.10124637931585312, + "kl": 0.31874447837471964, + "learning_rate": 5.750079913519835e-08, + "loss": 0.1191, + "reward": 1.804687535762787, + "reward_std": 0.23008078709244728, + "rewards/accuracy_reward": 0.11250000204890967, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 3028 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.7896057128906, + "epoch": 0.9694351096175388, + "grad_norm": 0.32607904076576233, + "kl": 0.19704431369900705, + "learning_rate": 5.6310224740202536e-08, + "loss": 0.0759, + "reward": 1.7203125476837158, + "reward_std": 0.17980255410075188, + "rewards/accuracy_reward": 0.01458333395421505, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7411458432674408, + "step": 3029 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.6291809082031, + "epoch": 0.9697551608257321, + "grad_norm": 0.13909195363521576, + "kl": 0.36173166893422604, + "learning_rate": 5.5132070459021914e-08, + "loss": 0.0725, + "reward": 1.7401042103767395, + "reward_std": 0.21508442014455795, + "rewards/accuracy_reward": 0.0479166679084301, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 3030 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.108349609375, + "epoch": 0.9700752120339254, + "grad_norm": 0.18055284023284912, + "kl": 0.40517835319042206, + "learning_rate": 5.3966337763223795e-08, + "loss": 0.1077, + "reward": 1.7041666746139525, + "reward_std": 0.1730414643883705, + "rewards/accuracy_reward": 0.002083333395421505, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7416666865348815, + "step": 3031 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.9541809082032, + "epoch": 0.9703952632421188, + "grad_norm": 0.1053120568394661, + "kl": 0.226448442786932, + "learning_rate": 5.281302810886013e-08, + "loss": 0.0701, + "reward": 1.7687500357627868, + "reward_std": 0.2200807049870491, + "rewards/accuracy_reward": 0.07291666939854621, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7354166865348816, + "step": 3032 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.9541809082032, + "epoch": 0.970715314450312, + "grad_norm": 0.14212128520011902, + "kl": 0.21750407926738263, + "learning_rate": 5.1672142936466385e-08, + "loss": 0.0557, + "reward": 1.7453125238418579, + "reward_std": 0.20142756253480912, + "rewards/accuracy_reward": 0.0375, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 3033 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.0041931152343, + "epoch": 0.9710353656585053, + "grad_norm": 0.12605281174182892, + "kl": 0.26052655279636383, + "learning_rate": 5.054368367106044e-08, + "loss": 0.0366, + "reward": 1.8239583730697633, + "reward_std": 0.18405086249113084, + "rewards/accuracy_reward": 0.11250000298023224, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 3034 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.1479339599609, + "epoch": 0.9713554168666987, + "grad_norm": 0.09858619421720505, + "kl": 0.29451265931129456, + "learning_rate": 4.9427651722137035e-08, + "loss": 0.0872, + "reward": 1.8848958730697631, + "reward_std": 0.2750785931944847, + "rewards/accuracy_reward": 0.1916666753590107, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7369791865348816, + "step": 3035 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.910433959961, + "epoch": 0.971675468074892, + "grad_norm": 0.2123694270849228, + "kl": 0.23448918834328653, + "learning_rate": 4.8324048483670006e-08, + "loss": 0.0798, + "reward": 1.8343750476837157, + "reward_std": 0.2175012208521366, + "rewards/accuracy_reward": 0.13541667275130748, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 3036 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.3396026611329, + "epoch": 0.9719955192830853, + "grad_norm": 0.0967680960893631, + "kl": 0.19490599371492862, + "learning_rate": 4.723287533411003e-08, + "loss": 0.07, + "reward": 1.7619791865348815, + "reward_std": 0.22343166172504425, + "rewards/accuracy_reward": 0.054166667722165586, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7432291865348816, + "step": 3037 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.5083618164062, + "epoch": 0.9723155704912786, + "grad_norm": 0.15740668773651123, + "kl": 0.2883405897766352, + "learning_rate": 4.615413363638133e-08, + "loss": 0.0588, + "reward": 1.7901041984558106, + "reward_std": 0.24050376191735268, + "rewards/accuracy_reward": 0.08125000316649675, + "rewards/format_reward": 0.9708333551883698, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 3038 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.1812683105469, + "epoch": 0.9726356216994719, + "grad_norm": 0.17502444982528687, + "kl": 0.32404273599386213, + "learning_rate": 4.508782473787943e-08, + "loss": 0.1306, + "reward": 1.7463541865348815, + "reward_std": 0.2457516685128212, + "rewards/accuracy_reward": 0.06250000130385161, + "rewards/format_reward": 0.9437500238418579, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 3039 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.497933959961, + "epoch": 0.9729556729076653, + "grad_norm": 0.09075573831796646, + "kl": 0.2926942154765129, + "learning_rate": 4.403394997047339e-08, + "loss": 0.1007, + "reward": 1.8265625476837157, + "reward_std": 0.27125475853681563, + "rewards/accuracy_reward": 0.13958333991467953, + "rewards/format_reward": 0.947916692495346, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 3040 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.8562744140625, + "epoch": 0.9732757241158585, + "grad_norm": 0.15645340085029602, + "kl": 0.46642662212252617, + "learning_rate": 4.299251065049803e-08, + "loss": 0.136, + "reward": 1.7083333730697632, + "reward_std": 0.28290791213512423, + "rewards/accuracy_reward": 0.04375000055879354, + "rewards/format_reward": 0.9354166805744171, + "rewards/tag_count_reward": 0.7291666746139527, + "step": 3041 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.3916931152344, + "epoch": 0.9735957753240518, + "grad_norm": 0.18267188966274261, + "kl": 0.4055576235055923, + "learning_rate": 4.1963508078759486e-08, + "loss": 0.1059, + "reward": 1.7119792103767395, + "reward_std": 0.26228521317243575, + "rewards/accuracy_reward": 0.058333334513008595, + "rewards/format_reward": 0.9250000178813934, + "rewards/tag_count_reward": 0.7286458551883698, + "step": 3042 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.5208618164063, + "epoch": 0.9739158265322452, + "grad_norm": 0.11391862481832504, + "kl": 0.2481472548097372, + "learning_rate": 4.094694354052742e-08, + "loss": 0.0629, + "reward": 1.8135417222976684, + "reward_std": 0.14478981345891953, + "rewards/accuracy_reward": 0.09583333693444729, + "rewards/format_reward": 0.9750000059604644, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 3043 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.6312683105468, + "epoch": 0.9742358777404385, + "grad_norm": 0.14298422634601593, + "kl": 0.3634337313473225, + "learning_rate": 3.9942818305537255e-08, + "loss": 0.1113, + "reward": 1.7713541984558105, + "reward_std": 0.23919814825057983, + "rewards/accuracy_reward": 0.08541666865348815, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.735937523841858, + "step": 3044 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.8083557128906, + "epoch": 0.9745559289486317, + "grad_norm": 0.14067691564559937, + "kl": 0.6347122829407453, + "learning_rate": 3.895113362798464e-08, + "loss": 0.1434, + "reward": 1.7317708730697632, + "reward_std": 0.28452501222491267, + "rewards/accuracy_reward": 0.07083333451300859, + "rewards/format_reward": 0.9354166805744171, + "rewards/tag_count_reward": 0.7255208492279053, + "step": 3045 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.6729400634765, + "epoch": 0.9748759801568251, + "grad_norm": 0.2969621419906616, + "kl": 0.24330624416470528, + "learning_rate": 3.797189074652874e-08, + "loss": 0.102, + "reward": 1.7552083492279054, + "reward_std": 0.192726993560791, + "rewards/accuracy_reward": 0.060416669212281705, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 3046 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.783349609375, + "epoch": 0.9751960313650184, + "grad_norm": 0.18543684482574463, + "kl": 0.343426763266325, + "learning_rate": 3.700509088428894e-08, + "loss": 0.1255, + "reward": 1.829687523841858, + "reward_std": 0.23542412966489792, + "rewards/accuracy_reward": 0.1416666742414236, + "rewards/format_reward": 0.9500000178813934, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 3047 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.0229309082031, + "epoch": 0.9755160825732118, + "grad_norm": 0.2756257951259613, + "kl": 0.3182968482375145, + "learning_rate": 3.6050735248841506e-08, + "loss": 0.0898, + "reward": 1.7703125596046447, + "reward_std": 0.24028173089027405, + "rewards/accuracy_reward": 0.0791666692122817, + "rewards/format_reward": 0.9562500178813934, + "rewards/tag_count_reward": 0.7348958492279053, + "step": 3048 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.6937744140625, + "epoch": 0.975836133781405, + "grad_norm": 0.19421151280403137, + "kl": 0.23446202799677848, + "learning_rate": 3.5108825032217355e-08, + "loss": 0.0748, + "reward": 1.8364583611488343, + "reward_std": 0.2093771666288376, + "rewards/accuracy_reward": 0.12916667070239782, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7427083611488342, + "step": 3049 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.8229278564453, + "epoch": 0.9761561849895983, + "grad_norm": 0.1765829175710678, + "kl": 0.239769284427166, + "learning_rate": 3.417936141090539e-08, + "loss": 0.0545, + "reward": 1.792187547683716, + "reward_std": 0.21487408950924874, + "rewards/accuracy_reward": 0.08541667070239782, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7401041746139526, + "step": 3050 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.0208557128906, + "epoch": 0.9764762361977917, + "grad_norm": 0.13331003487110138, + "kl": 0.2412990540266037, + "learning_rate": 3.326234554584917e-08, + "loss": 0.1001, + "reward": 1.7531250238418579, + "reward_std": 0.21676268726587294, + "rewards/accuracy_reward": 0.05416666828095913, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 3051 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.6000183105468, + "epoch": 0.976796287405985, + "grad_norm": 0.11980487406253815, + "kl": 0.3370642215013504, + "learning_rate": 3.235777858244027e-08, + "loss": 0.0972, + "reward": 1.8760417342185973, + "reward_std": 0.20026633143424988, + "rewards/accuracy_reward": 0.177083339355886, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 3052 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.733349609375, + "epoch": 0.9771163386141782, + "grad_norm": 0.18214666843414307, + "kl": 0.2942465879023075, + "learning_rate": 3.1465661650523785e-08, + "loss": 0.1002, + "reward": 1.8046875596046448, + "reward_std": 0.24582924097776412, + "rewards/accuracy_reward": 0.12083333544433117, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7359375119209289, + "step": 3053 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.4770965576172, + "epoch": 0.9774363898223716, + "grad_norm": 0.2062879353761673, + "kl": 0.21511095613241196, + "learning_rate": 3.0585995864395033e-08, + "loss": 0.0901, + "reward": 1.7583333730697632, + "reward_std": 0.20363759249448776, + "rewards/accuracy_reward": 0.054166667722165586, + "rewards/format_reward": 0.9583333492279053, + "rewards/tag_count_reward": 0.7458333551883698, + "step": 3054 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.6312683105468, + "epoch": 0.9777564410305649, + "grad_norm": 0.25694888830184937, + "kl": 0.2298228584229946, + "learning_rate": 2.9718782322794015e-08, + "loss": 0.0708, + "reward": 1.7968750476837159, + "reward_std": 0.24236893951892852, + "rewards/accuracy_reward": 0.09583333730697632, + "rewards/format_reward": 0.9604166924953461, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 3055 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.1375183105469, + "epoch": 0.9780764922387583, + "grad_norm": 0.06718038767576218, + "kl": 0.20514454543590546, + "learning_rate": 2.8864022108910927e-08, + "loss": 0.0614, + "reward": 1.7260416865348815, + "reward_std": 0.13063133358955384, + "rewards/accuracy_reward": 0.010416667163372039, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 3056 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.6791870117188, + "epoch": 0.9783965434469515, + "grad_norm": 0.13397450745105743, + "kl": 0.20096199810504914, + "learning_rate": 2.802171629037953e-08, + "loss": 0.0603, + "reward": 1.8041666984558105, + "reward_std": 0.21762454360723496, + "rewards/accuracy_reward": 0.1125000037252903, + "rewards/format_reward": 0.9520833492279053, + "rewards/tag_count_reward": 0.7395833492279053, + "step": 3057 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.3729400634766, + "epoch": 0.9787165946551448, + "grad_norm": 0.17857791483402252, + "kl": 0.3088830351829529, + "learning_rate": 2.719186591927603e-08, + "loss": 0.1031, + "reward": 1.7270833730697632, + "reward_std": 0.23266912549734114, + "rewards/accuracy_reward": 0.03750000111758709, + "rewards/format_reward": 0.950000011920929, + "rewards/tag_count_reward": 0.7395833432674408, + "step": 3058 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.3021087646484, + "epoch": 0.9790366458633382, + "grad_norm": 0.15610116720199585, + "kl": 0.23165589943528175, + "learning_rate": 2.637447203212129e-08, + "loss": 0.0853, + "reward": 1.8005208849906922, + "reward_std": 0.22202980518341064, + "rewards/accuracy_reward": 0.0979166692122817, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 3059 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.0396087646484, + "epoch": 0.9793566970715314, + "grad_norm": 0.24467244744300842, + "kl": 0.2348164737224579, + "learning_rate": 2.556953564987752e-08, + "loss": 0.0722, + "reward": 1.8072917342185975, + "reward_std": 0.18154692202806472, + "rewards/accuracy_reward": 0.09583333637565375, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.744791692495346, + "step": 3060 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.1250183105469, + "epoch": 0.9796767482797247, + "grad_norm": 0.14823976159095764, + "kl": 0.28711467459797857, + "learning_rate": 2.4777057777946034e-08, + "loss": 0.1042, + "reward": 1.6937500357627868, + "reward_std": 0.2312575563788414, + "rewards/accuracy_reward": 0.008333333395421505, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7354166984558106, + "step": 3061 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.2458557128906, + "epoch": 0.9799967994879181, + "grad_norm": 0.13439303636550903, + "kl": 0.21451763063669205, + "learning_rate": 2.3997039406167266e-08, + "loss": 0.0484, + "reward": 1.7958333849906922, + "reward_std": 0.13553692996501923, + "rewards/accuracy_reward": 0.0791666692122817, + "rewards/format_reward": 0.9708333492279053, + "rewards/tag_count_reward": 0.7458333492279052, + "step": 3062 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.1854309082031, + "epoch": 0.9803168506961114, + "grad_norm": 0.12152549624443054, + "kl": 0.36722201108932495, + "learning_rate": 2.322948150881854e-08, + "loss": 0.1193, + "reward": 1.707812535762787, + "reward_std": 0.26899106055498123, + "rewards/accuracy_reward": 0.052083334513008596, + "rewards/format_reward": 0.9250000238418579, + "rewards/tag_count_reward": 0.7307291865348816, + "step": 3063 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.4958404541015, + "epoch": 0.9806369019043046, + "grad_norm": 0.1406739056110382, + "kl": 0.2821358598768711, + "learning_rate": 2.2474385044615188e-08, + "loss": 0.0637, + "reward": 1.8052083611488343, + "reward_std": 0.23521801978349685, + "rewards/accuracy_reward": 0.10416666883975267, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 3064 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.8479370117187, + "epoch": 0.980956953112498, + "grad_norm": 0.15969295799732208, + "kl": 0.3610251784324646, + "learning_rate": 2.173175095670499e-08, + "loss": 0.122, + "reward": 1.7802083611488342, + "reward_std": 0.2173793375492096, + "rewards/accuracy_reward": 0.08958333563059569, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 3065 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.2812683105469, + "epoch": 0.9812770043206913, + "grad_norm": 0.21414420008659363, + "kl": 0.3591214381158352, + "learning_rate": 2.100158017267151e-08, + "loss": 0.0997, + "reward": 1.7401041984558105, + "reward_std": 0.2544398784637451, + "rewards/accuracy_reward": 0.06041666846722364, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 3066 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.1646057128906, + "epoch": 0.9815970555288847, + "grad_norm": 0.19221711158752441, + "kl": 0.2868764579296112, + "learning_rate": 2.028387360453188e-08, + "loss": 0.0974, + "reward": 1.7140625238418579, + "reward_std": 0.22124834954738617, + "rewards/accuracy_reward": 0.02916666679084301, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 3067 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.27294921875, + "epoch": 0.9819171067370779, + "grad_norm": 0.1753731369972229, + "kl": 0.30784842520952227, + "learning_rate": 1.9578632148733455e-08, + "loss": 0.1143, + "reward": 1.6729167103767395, + "reward_std": 0.24664187729358672, + "rewards/accuracy_reward": 0.006250000186264515, + "rewards/format_reward": 0.9333333551883698, + "rewards/tag_count_reward": 0.7333333551883697, + "step": 3068 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.4666870117187, + "epoch": 0.9822371579452712, + "grad_norm": 0.24415169656276703, + "kl": 0.34099898114800453, + "learning_rate": 1.8885856686152725e-08, + "loss": 0.0821, + "reward": 1.726562535762787, + "reward_std": 0.21097205057740212, + "rewards/accuracy_reward": 0.04583333432674408, + "rewards/format_reward": 0.9479166865348816, + "rewards/tag_count_reward": 0.732812511920929, + "step": 3069 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.0958435058594, + "epoch": 0.9825572091534646, + "grad_norm": 0.09442782402038574, + "kl": 0.325906627625227, + "learning_rate": 1.8205548082099733e-08, + "loss": 0.0564, + "reward": 1.7869791984558105, + "reward_std": 0.1821589708328247, + "rewards/accuracy_reward": 0.0770833358168602, + "rewards/format_reward": 0.9687500119209289, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 3070 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.789599609375, + "epoch": 0.9828772603616579, + "grad_norm": 0.253113329410553, + "kl": 0.24712217450141907, + "learning_rate": 1.7537707186308093e-08, + "loss": 0.1244, + "reward": 1.801562535762787, + "reward_std": 0.2580643087625504, + "rewards/accuracy_reward": 0.12291667275130749, + "rewards/format_reward": 0.9458333551883698, + "rewards/tag_count_reward": 0.732812511920929, + "step": 3071 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.6104309082032, + "epoch": 0.9831973115698511, + "grad_norm": 0.1100858673453331, + "kl": 0.22945720814168452, + "learning_rate": 1.6882334832942772e-08, + "loss": 0.0992, + "reward": 1.8088541984558106, + "reward_std": 0.20694586411118507, + "rewards/accuracy_reward": 0.10833333563059569, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7442708551883698, + "step": 3072 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.0666870117187, + "epoch": 0.9835173627780445, + "grad_norm": 0.12897634506225586, + "kl": 0.30970082357525824, + "learning_rate": 1.623943184059229e-08, + "loss": 0.1166, + "reward": 1.7489583849906922, + "reward_std": 0.21749227941036225, + "rewards/accuracy_reward": 0.0687500013038516, + "rewards/format_reward": 0.9416666865348816, + "rewards/tag_count_reward": 0.7385416805744172, + "step": 3073 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.2000183105469, + "epoch": 0.9838374139862378, + "grad_norm": 0.24033485352993011, + "kl": 0.3132885962724686, + "learning_rate": 1.5608999012272085e-08, + "loss": 0.0843, + "reward": 1.7281250357627869, + "reward_std": 0.16667985767126084, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 3074 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.7500152587891, + "epoch": 0.9841574651944311, + "grad_norm": 0.07913769036531448, + "kl": 0.2783251881599426, + "learning_rate": 1.499103713542005e-08, + "loss": 0.0482, + "reward": 1.7885416746139526, + "reward_std": 0.1665610209107399, + "rewards/accuracy_reward": 0.07291666921228171, + "rewards/format_reward": 0.9750000059604644, + "rewards/tag_count_reward": 0.740625011920929, + "step": 3075 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.0833557128906, + "epoch": 0.9844775164026244, + "grad_norm": 0.17048057913780212, + "kl": 0.34632964730262755, + "learning_rate": 1.4385546981897647e-08, + "loss": 0.1022, + "reward": 1.7401042103767395, + "reward_std": 0.21267496347427367, + "rewards/accuracy_reward": 0.045833334885537626, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 3076 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.2833526611328, + "epoch": 0.9847975676108177, + "grad_norm": 0.13539846241474152, + "kl": 0.30142875015735626, + "learning_rate": 1.379252930799102e-08, + "loss": 0.0594, + "reward": 1.7718750476837157, + "reward_std": 0.22751090675592422, + "rewards/accuracy_reward": 0.08333333730697631, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7343750178813935, + "step": 3077 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.7583557128906, + "epoch": 0.985117618819011, + "grad_norm": 0.3082070052623749, + "kl": 0.3032428666949272, + "learning_rate": 1.3211984854404337e-08, + "loss": 0.1176, + "reward": 1.7770833730697633, + "reward_std": 0.22418717592954635, + "rewards/accuracy_reward": 0.08333333451300859, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 3078 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.8271057128907, + "epoch": 0.9854376700272044, + "grad_norm": 0.17651331424713135, + "kl": 0.32943628504872324, + "learning_rate": 1.264391434626533e-08, + "loss": 0.1171, + "reward": 1.7630208730697632, + "reward_std": 0.19876088947057724, + "rewards/accuracy_reward": 0.06666667070239782, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 3079 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.633349609375, + "epoch": 0.9857577212353976, + "grad_norm": 0.12131522595882416, + "kl": 0.22669636681675912, + "learning_rate": 1.2088318493117534e-08, + "loss": 0.0728, + "reward": 1.734375035762787, + "reward_std": 0.17790770083665847, + "rewards/accuracy_reward": 0.025000000186264516, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 3080 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.4125152587891, + "epoch": 0.986077772443591, + "grad_norm": 0.16687986254692078, + "kl": 0.30181434378027916, + "learning_rate": 1.1545197988925839e-08, + "loss": 0.0896, + "reward": 1.792187547683716, + "reward_std": 0.17845623940229416, + "rewards/accuracy_reward": 0.10000000298023223, + "rewards/format_reward": 0.9541666865348816, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 3081 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.6791931152344, + "epoch": 0.9863978236517843, + "grad_norm": 0.166269451379776, + "kl": 0.38384261056780816, + "learning_rate": 1.1014553512072036e-08, + "loss": 0.0944, + "reward": 1.7432292222976684, + "reward_std": 0.21463448256254197, + "rewards/accuracy_reward": 0.05416666828095913, + "rewards/format_reward": 0.950000011920929, + "rewards/tag_count_reward": 0.7390625238418579, + "step": 3082 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.1666748046875, + "epoch": 0.9867178748599776, + "grad_norm": 0.26047223806381226, + "kl": 0.3793337717652321, + "learning_rate": 1.049638572535483e-08, + "loss": 0.1121, + "reward": 1.7692708611488341, + "reward_std": 0.21841761842370033, + "rewards/accuracy_reward": 0.07916666865348816, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 3083 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.1937683105468, + "epoch": 0.9870379260681709, + "grad_norm": 0.2953816056251526, + "kl": 0.26855692490935323, + "learning_rate": 9.990695275988727e-09, + "loss": 0.075, + "reward": 1.8380208730697631, + "reward_std": 0.19789665341377258, + "rewards/accuracy_reward": 0.13125000447034835, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 3084 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.4000305175781, + "epoch": 0.9873579772763642, + "grad_norm": 0.13979560136795044, + "kl": 0.3356237094849348, + "learning_rate": 9.49748279560514e-09, + "loss": 0.0903, + "reward": 1.8380208611488342, + "reward_std": 0.2137792520225048, + "rewards/accuracy_reward": 0.13750000204890966, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7380208551883698, + "step": 3085 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.4916870117188, + "epoch": 0.9876780284845575, + "grad_norm": 0.19319754838943481, + "kl": 0.3143337398767471, + "learning_rate": 9.01674890024684e-09, + "loss": 0.0927, + "reward": 1.7260417222976685, + "reward_std": 0.23539431765675545, + "rewards/accuracy_reward": 0.03541666716337204, + "rewards/format_reward": 0.9479166984558105, + "rewards/tag_count_reward": 0.7427083551883698, + "step": 3086 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.2896118164062, + "epoch": 0.9879980796927509, + "grad_norm": 0.13193345069885254, + "kl": 0.33816151022911073, + "learning_rate": 8.548494190372402e-09, + "loss": 0.1099, + "reward": 1.7890625715255737, + "reward_std": 0.2728777229785919, + "rewards/accuracy_reward": 0.11458333805203438, + "rewards/format_reward": 0.9395833551883698, + "rewards/tag_count_reward": 0.7348958551883698, + "step": 3087 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.3250244140625, + "epoch": 0.9883181309009441, + "grad_norm": 0.10972767323255539, + "kl": 0.2844900615513325, + "learning_rate": 8.092719250853975e-09, + "loss": 0.0916, + "reward": 1.7411458492279053, + "reward_std": 0.14775414243340493, + "rewards/accuracy_reward": 0.03333333432674408, + "rewards/format_reward": 0.9687500178813935, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 3088 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.6646057128906, + "epoch": 0.9886381821091375, + "grad_norm": 0.15878772735595703, + "kl": 0.34863837584853175, + "learning_rate": 7.649424650972847e-09, + "loss": 0.1037, + "reward": 1.8317708730697633, + "reward_std": 0.22507388815283774, + "rewards/accuracy_reward": 0.13125000428408384, + "rewards/format_reward": 0.9604166865348815, + "rewards/tag_count_reward": 0.7401041865348816, + "step": 3089 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.4354278564454, + "epoch": 0.9889582333173308, + "grad_norm": 0.14936627447605133, + "kl": 0.2459190659224987, + "learning_rate": 7.218610944426108e-09, + "loss": 0.0887, + "reward": 1.7484375357627868, + "reward_std": 0.19498306661844253, + "rewards/accuracy_reward": 0.0479166692122817, + "rewards/format_reward": 0.9604166805744171, + "rewards/tag_count_reward": 0.7401041984558105, + "step": 3090 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.904183959961, + "epoch": 0.9892782845255241, + "grad_norm": 0.11002171039581299, + "kl": 0.49073898121714593, + "learning_rate": 6.800278669317762e-09, + "loss": 0.1155, + "reward": 1.7229166865348815, + "reward_std": 0.24243892431259156, + "rewards/accuracy_reward": 0.039583333395421506, + "rewards/format_reward": 0.9500000238418579, + "rewards/tag_count_reward": 0.7333333551883697, + "step": 3091 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.7021026611328, + "epoch": 0.9895983357337174, + "grad_norm": 0.11355835944414139, + "kl": 0.33552836179733275, + "learning_rate": 6.394428348164284e-09, + "loss": 0.0884, + "reward": 1.7526041984558105, + "reward_std": 0.18140390366315842, + "rewards/accuracy_reward": 0.054166667722165586, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 3092 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.3541900634766, + "epoch": 0.9899183869419107, + "grad_norm": 0.19902049005031586, + "kl": 0.24995511323213576, + "learning_rate": 6.001060487891286e-09, + "loss": 0.0791, + "reward": 1.7546875476837158, + "reward_std": 0.2084854982793331, + "rewards/accuracy_reward": 0.06041666865348816, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 3093 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.2750244140625, + "epoch": 0.990238438150104, + "grad_norm": 0.17721405625343323, + "kl": 0.2684307098388672, + "learning_rate": 5.6201755798313e-09, + "loss": 0.0578, + "reward": 1.7473958492279054, + "reward_std": 0.12207645624876022, + "rewards/accuracy_reward": 0.03541666772216558, + "rewards/format_reward": 0.9729166865348816, + "rewards/tag_count_reward": 0.739062511920929, + "step": 3094 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.2583435058593, + "epoch": 0.9905584893582974, + "grad_norm": 0.11798027902841568, + "kl": 0.22919094637036325, + "learning_rate": 5.251774099727103e-09, + "loss": 0.0895, + "reward": 1.8020833730697632, + "reward_std": 0.20591284781694413, + "rewards/accuracy_reward": 0.0958333358168602, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7416666924953461, + "step": 3095 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.3979400634765, + "epoch": 0.9908785405664906, + "grad_norm": 0.19848725199699402, + "kl": 0.4313486650586128, + "learning_rate": 4.895856507730612e-09, + "loss": 0.1078, + "reward": 1.7447916746139527, + "reward_std": 0.23529223948717118, + "rewards/accuracy_reward": 0.0708333333954215, + "rewards/format_reward": 0.9375000178813935, + "rewards/tag_count_reward": 0.7364583492279053, + "step": 3096 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.0312744140625, + "epoch": 0.991198591774684, + "grad_norm": 0.29588741064071655, + "kl": 0.3257336333394051, + "learning_rate": 4.55242324839622e-09, + "loss": 0.1146, + "reward": 1.7567708611488342, + "reward_std": 0.28883601576089857, + "rewards/accuracy_reward": 0.08541666883975267, + "rewards/format_reward": 0.9375000178813935, + "rewards/tag_count_reward": 0.7338541924953461, + "step": 3097 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.9854370117188, + "epoch": 0.9915186429828773, + "grad_norm": 0.1464318186044693, + "kl": 0.31563855409622193, + "learning_rate": 4.22147475068968e-09, + "loss": 0.0892, + "reward": 1.8328125596046447, + "reward_std": 0.22565954253077508, + "rewards/accuracy_reward": 0.1291666690260172, + "rewards/format_reward": 0.9645833611488343, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 3098 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.0625091552735, + "epoch": 0.9918386941910706, + "grad_norm": 0.2843438386917114, + "kl": 0.24993645697832106, + "learning_rate": 3.903011427978109e-09, + "loss": 0.0968, + "reward": 1.7885417222976685, + "reward_std": 0.22308254763484, + "rewards/accuracy_reward": 0.0916666692122817, + "rewards/format_reward": 0.9562500238418579, + "rewards/tag_count_reward": 0.7406250238418579, + "step": 3099 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.564599609375, + "epoch": 0.9921587453992639, + "grad_norm": 0.1468961387872696, + "kl": 0.2920463755726814, + "learning_rate": 3.597033678038875e-09, + "loss": 0.0485, + "reward": 1.8338542103767395, + "reward_std": 0.1874682992696762, + "rewards/accuracy_reward": 0.12500000298023223, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 3100 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.4166870117188, + "epoch": 0.9924787966074572, + "grad_norm": 0.15643829107284546, + "kl": 0.31074003875255585, + "learning_rate": 3.303541883049599e-09, + "loss": 0.1386, + "reward": 1.7671875596046447, + "reward_std": 0.2778718054294586, + "rewards/accuracy_reward": 0.08541667088866234, + "rewards/format_reward": 0.9500000298023223, + "rewards/tag_count_reward": 0.7317708492279053, + "step": 3101 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.9021057128906, + "epoch": 0.9927988478156505, + "grad_norm": 0.1353743076324463, + "kl": 0.2804649338126183, + "learning_rate": 3.0225364095970432e-09, + "loss": 0.1057, + "reward": 1.756250023841858, + "reward_std": 0.2645873501896858, + "rewards/accuracy_reward": 0.07916666679084301, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.7333333432674408, + "step": 3102 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.1333526611328, + "epoch": 0.9931188990238438, + "grad_norm": 0.0932333841919899, + "kl": 0.25905941873788835, + "learning_rate": 2.7540176086671145e-09, + "loss": 0.1129, + "reward": 1.7427083611488343, + "reward_std": 0.1830264799296856, + "rewards/accuracy_reward": 0.041666666977107525, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7385416865348816, + "step": 3103 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.8979431152344, + "epoch": 0.9934389502320371, + "grad_norm": 0.17291957139968872, + "kl": 0.2739930372685194, + "learning_rate": 2.4979858156537474e-09, + "loss": 0.1074, + "reward": 1.7869792103767395, + "reward_std": 0.22782448977231978, + "rewards/accuracy_reward": 0.09166667088866234, + "rewards/format_reward": 0.954166692495346, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 3104 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.5666870117187, + "epoch": 0.9937590014402304, + "grad_norm": 0.14900268614292145, + "kl": 0.22179678678512574, + "learning_rate": 2.2544413503522432e-09, + "loss": 0.0685, + "reward": 1.8458333730697631, + "reward_std": 0.23180068656802177, + "rewards/accuracy_reward": 0.14375000465661286, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7395833551883697, + "step": 3105 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.6395965576172, + "epoch": 0.9940790526484238, + "grad_norm": 0.16432535648345947, + "kl": 0.378567086905241, + "learning_rate": 2.02338451695816e-09, + "loss": 0.0833, + "reward": 1.7901042222976684, + "reward_std": 0.22208391055464743, + "rewards/accuracy_reward": 0.08541666939854622, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7401041805744171, + "step": 3106 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.2937744140625, + "epoch": 0.994399103856617, + "grad_norm": 0.1785258948802948, + "kl": 0.21541684567928315, + "learning_rate": 1.804815604075083e-09, + "loss": 0.0671, + "reward": 1.788020873069763, + "reward_std": 0.16471139043569566, + "rewards/accuracy_reward": 0.07500000335276127, + "rewards/format_reward": 0.9708333432674408, + "rewards/tag_count_reward": 0.7421875178813935, + "step": 3107 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.3437713623047, + "epoch": 0.9947191550648103, + "grad_norm": 0.2913496196269989, + "kl": 0.38024489805102346, + "learning_rate": 1.5987348847024132e-09, + "loss": 0.1083, + "reward": 1.8026042103767395, + "reward_std": 0.24502479285001755, + "rewards/accuracy_reward": 0.11666666865348815, + "rewards/format_reward": 0.9520833551883697, + "rewards/tag_count_reward": 0.7338541805744171, + "step": 3108 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.1750183105469, + "epoch": 0.9950392062730037, + "grad_norm": 0.17706918716430664, + "kl": 0.24948984608054162, + "learning_rate": 1.4051426162464687e-09, + "loss": 0.0613, + "reward": 1.7614583611488341, + "reward_std": 0.1818772867321968, + "rewards/accuracy_reward": 0.0520833358168602, + "rewards/format_reward": 0.9666666924953461, + "rewards/tag_count_reward": 0.7427083492279053, + "step": 3109 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.2291870117188, + "epoch": 0.995359257481197, + "grad_norm": 0.16352076828479767, + "kl": 0.3233878821134567, + "learning_rate": 1.2240390405116043e-09, + "loss": 0.0913, + "reward": 1.756250023841858, + "reward_std": 0.23520760014653205, + "rewards/accuracy_reward": 0.07708333563059569, + "rewards/format_reward": 0.9437500178813935, + "rewards/tag_count_reward": 0.735416692495346, + "step": 3110 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.6187622070313, + "epoch": 0.9956793086893903, + "grad_norm": 0.3277549147605896, + "kl": 0.3297587588429451, + "learning_rate": 1.0554243837035404e-09, + "loss": 0.1324, + "reward": 1.7932292222976685, + "reward_std": 0.23931009843945503, + "rewards/accuracy_reward": 0.11041666865348816, + "rewards/format_reward": 0.9437500238418579, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 3111 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.1791748046875, + "epoch": 0.9959993598975836, + "grad_norm": 0.1446753889322281, + "kl": 0.25720045566558836, + "learning_rate": 8.992988564315852e-10, + "loss": 0.0708, + "reward": 1.7598958730697631, + "reward_std": 0.20648740902543067, + "rewards/accuracy_reward": 0.06250000037252904, + "rewards/format_reward": 0.9583333551883697, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 3112 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.9125061035156, + "epoch": 0.9963194111057769, + "grad_norm": 0.14114601910114288, + "kl": 0.2895710654556751, + "learning_rate": 7.556626537019717e-10, + "loss": 0.0735, + "reward": 1.764062523841858, + "reward_std": 0.16610406339168549, + "rewards/accuracy_reward": 0.05625000186264515, + "rewards/format_reward": 0.9666666805744171, + "rewards/tag_count_reward": 0.7411458611488342, + "step": 3113 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.2562683105468, + "epoch": 0.9966394623139703, + "grad_norm": 0.15220247209072113, + "kl": 0.25976728796958926, + "learning_rate": 6.245159549223001e-10, + "loss": 0.0824, + "reward": 1.8223958730697631, + "reward_std": 0.14715693891048431, + "rewards/accuracy_reward": 0.10416667088866234, + "rewards/format_reward": 0.9729166805744172, + "rewards/tag_count_reward": 0.745312511920929, + "step": 3114 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.3187683105468, + "epoch": 0.9969595135221635, + "grad_norm": 0.12920396029949188, + "kl": 0.2570869214832783, + "learning_rate": 5.058589239026468e-10, + "loss": 0.0691, + "reward": 1.8505208849906922, + "reward_std": 0.22681027501821518, + "rewards/accuracy_reward": 0.14375000465661286, + "rewards/format_reward": 0.9645833551883698, + "rewards/tag_count_reward": 0.7421875238418579, + "step": 3115 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.4187683105469, + "epoch": 0.9972795647303568, + "grad_norm": 0.17800143361091614, + "kl": 0.3351675134152174, + "learning_rate": 3.9969170884890384e-10, + "loss": 0.0882, + "reward": 1.8515625238418578, + "reward_std": 0.20919253826141357, + "rewards/accuracy_reward": 0.14375000353902578, + "rewards/format_reward": 0.9666666865348816, + "rewards/tag_count_reward": 0.7411458492279053, + "step": 3116 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.8896057128907, + "epoch": 0.9975996159385502, + "grad_norm": 0.20195423066616058, + "kl": 0.3757719676941633, + "learning_rate": 3.0601444236944e-10, + "loss": 0.0951, + "reward": 1.7546875476837158, + "reward_std": 0.21815839111804963, + "rewards/accuracy_reward": 0.07291666883975267, + "rewards/format_reward": 0.9458333492279053, + "rewards/tag_count_reward": 0.7359375178813934, + "step": 3117 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.9270965576172, + "epoch": 0.9979196671467435, + "grad_norm": 0.17428921163082123, + "kl": 0.24478441402316092, + "learning_rate": 2.2482724147177005e-10, + "loss": 0.0998, + "reward": 1.772395873069763, + "reward_std": 0.21767098605632781, + "rewards/accuracy_reward": 0.07083333488553763, + "rewards/format_reward": 0.962500023841858, + "rewards/tag_count_reward": 0.7390625178813934, + "step": 3118 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.952099609375, + "epoch": 0.9982397183549367, + "grad_norm": 0.19090551137924194, + "kl": 0.2739730294793844, + "learning_rate": 1.561302075625548e-10, + "loss": 0.0562, + "reward": 1.8151041865348816, + "reward_std": 0.19158529341220856, + "rewards/accuracy_reward": 0.10833333656191826, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7442708492279053, + "step": 3119 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.8687713623046, + "epoch": 0.9985597695631301, + "grad_norm": 0.2590288817882538, + "kl": 0.24943210408091546, + "learning_rate": 9.9923426446491e-11, + "loss": 0.0792, + "reward": 1.8656250476837157, + "reward_std": 0.2500589728355408, + "rewards/accuracy_reward": 0.16041667014360428, + "rewards/format_reward": 0.9645833492279052, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 3120 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.5166900634765, + "epoch": 0.9988798207713234, + "grad_norm": 0.16108450293540955, + "kl": 0.1967288039624691, + "learning_rate": 5.620696832964179e-11, + "loss": 0.087, + "reward": 1.8333333730697632, + "reward_std": 0.1661988228559494, + "rewards/accuracy_reward": 0.11666666977107525, + "rewards/format_reward": 0.9708333432674408, + "rewards/tag_count_reward": 0.7458333492279052, + "step": 3121 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.1583557128906, + "epoch": 0.9991998719795168, + "grad_norm": 0.10591613501310349, + "kl": 0.4319122813642025, + "learning_rate": 2.4980887813885745e-11, + "loss": 0.0541, + "reward": 1.7588541984558106, + "reward_std": 0.18204645216464996, + "rewards/accuracy_reward": 0.06666666772216559, + "rewards/format_reward": 0.9541666805744171, + "rewards/tag_count_reward": 0.7380208492279052, + "step": 3122 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.2312774658203, + "epoch": 0.99951992318771, + "grad_norm": 0.13070808351039886, + "kl": 0.35216558873653414, + "learning_rate": 6.245223903578179e-12, + "loss": 0.1193, + "reward": 1.7572917103767396, + "reward_std": 0.24927352666854857, + "rewards/accuracy_reward": 0.07708333637565375, + "rewards/format_reward": 0.9395833611488342, + "rewards/tag_count_reward": 0.7406250178813935, + "step": 3123 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.9297424316406, + "epoch": 0.9998399743959033, + "grad_norm": 0.24119921028614044, + "kl": 0.23806431293487548, + "learning_rate": 0.0, + "loss": 0.072, + "reward": 1.7848958730697633, + "reward_std": 0.2171033151447773, + "rewards/accuracy_reward": 0.08125000149011612, + "rewards/format_reward": 0.9625000178813934, + "rewards/tag_count_reward": 0.7411458551883697, + "step": 3124 + }, + { + "epoch": 0.9998399743959033, + "step": 3124, + "total_flos": 0.0, + "train_loss": 0.07466813361974993, + "train_runtime": 412723.7185, + "train_samples_per_second": 0.227, + "train_steps_per_second": 0.008 + } + ], + "logging_steps": 1, + "max_steps": 3124, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}