{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997759689343589, "eval_steps": 500, "global_step": 3347, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 648.9933319091797, "epoch": 0.00029870808752146963, "grad_norm": 0.35712730884552, "kl": 0.0, "learning_rate": 5.970149253731344e-08, "loss": 0.0267, "reward": 0.1941964365541935, "reward_std": 0.30125652253627777, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.01785714365541935, "rewards/tag_count_reward": 0.1004464328289032, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 690.7321624755859, "epoch": 0.0005974161750429393, "grad_norm": 0.32807624340057373, "kl": 0.0, "learning_rate": 1.1940298507462688e-07, "loss": 0.0408, "reward": 0.2126116156578064, "reward_std": 0.28106333315372467, "rewards/accuracy_reward": 0.08705357415601611, "rewards/format_reward": 0.015625000931322575, "rewards/tag_count_reward": 0.1099330373108387, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 652.9576110839844, "epoch": 0.0008961242625644089, "grad_norm": 0.3359529674053192, "kl": 0.00012636184692382812, "learning_rate": 1.7910447761194033e-07, "loss": 0.0192, "reward": 0.1808035783469677, "reward_std": 0.3680461049079895, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.03571428777649999, "rewards/tag_count_reward": 0.1026785746216774, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 645.700927734375, "epoch": 0.0011948323500858785, "grad_norm": 0.39243999123573303, "kl": 0.00014209747314453125, "learning_rate": 2.3880597014925377e-07, "loss": -0.0263, "reward": 0.20089286379516125, "reward_std": 0.2840797193348408, "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.024553573224693537, "rewards/tag_count_reward": 0.06696428824216127, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 643.325927734375, "epoch": 0.0014935404376073482, "grad_norm": 0.32424959540367126, "kl": 0.0001373291015625, "learning_rate": 2.9850746268656716e-07, "loss": 0.0405, "reward": 0.19531250931322575, "reward_std": 0.28625644743442535, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.024553572526201606, "rewards/tag_count_reward": 0.09709822200238705, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 577.9174499511719, "epoch": 0.0017922485251288178, "grad_norm": 0.4192008078098297, "kl": 0.00014519691467285156, "learning_rate": 3.5820895522388065e-07, "loss": 0.0323, "reward": 0.2176339328289032, "reward_std": 0.37645474076271057, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.026785715483129025, "rewards/tag_count_reward": 0.1104910783469677, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 634.7232513427734, "epoch": 0.0020909566126502874, "grad_norm": 0.38207411766052246, "kl": 0.0001436471939086914, "learning_rate": 4.179104477611941e-07, "loss": 0.0261, "reward": 0.2114955447614193, "reward_std": 0.40590810030698776, "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.03125000186264515, "rewards/tag_count_reward": 0.10212053917348385, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 652.075927734375, "epoch": 0.002389664700171757, "grad_norm": 0.3726411461830139, "kl": 0.00013434886932373047, "learning_rate": 4.776119402985075e-07, "loss": 0.0178, "reward": 0.274553582072258, "reward_std": 0.3702705651521683, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.037946430733427405, "rewards/tag_count_reward": 0.12053571827709675, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 674.2410888671875, "epoch": 0.0026883727876932267, "grad_norm": 0.32046744227409363, "kl": 0.000133514404296875, "learning_rate": 5.373134328358209e-07, "loss": 0.0324, "reward": 0.2566964402794838, "reward_std": 0.3425132781267166, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.03125000139698386, "rewards/tag_count_reward": 0.1116071455180645, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 663.4241333007812, "epoch": 0.0029870808752146963, "grad_norm": 0.3215222656726837, "kl": 0.00014126300811767578, "learning_rate": 5.970149253731343e-07, "loss": 0.0296, "reward": 0.23214286752045155, "reward_std": 0.32417190819978714, "rewards/accuracy_reward": 0.10267857369035482, "rewards/format_reward": 0.024553572991862893, "rewards/tag_count_reward": 0.10491071548312902, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 641.6317138671875, "epoch": 0.003285788962736166, "grad_norm": 0.369708389043808, "kl": 0.00017976760864257812, "learning_rate": 6.567164179104478e-07, "loss": 0.0292, "reward": 0.2327009029686451, "reward_std": 0.3681044206023216, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.033482144586741924, "rewards/tag_count_reward": 0.0987723246216774, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 670.607177734375, "epoch": 0.0035844970502576356, "grad_norm": 0.37167418003082275, "kl": 0.00031113624572753906, "learning_rate": 7.164179104477613e-07, "loss": 0.0354, "reward": 0.2890625074505806, "reward_std": 0.3958292677998543, "rewards/accuracy_reward": 0.10491071920841932, "rewards/format_reward": 0.03794643096625805, "rewards/tag_count_reward": 0.14620536379516125, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 670.7031555175781, "epoch": 0.003883205137779105, "grad_norm": 0.32989999651908875, "kl": 0.0003037452697753906, "learning_rate": 7.761194029850747e-07, "loss": 0.0225, "reward": 0.2059151865541935, "reward_std": 0.2921414449810982, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.020089287078008056, "rewards/tag_count_reward": 0.09877232648432255, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 651.2924499511719, "epoch": 0.004181913225300575, "grad_norm": 0.35746681690216064, "kl": 0.0013456344604492188, "learning_rate": 8.358208955223882e-07, "loss": 0.052, "reward": 0.3113839477300644, "reward_std": 0.4012499302625656, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.020089286845177412, "rewards/tag_count_reward": 0.1484375074505806, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 568.6986999511719, "epoch": 0.004480621312822045, "grad_norm": 0.3707813620567322, "kl": 0.00238800048828125, "learning_rate": 8.955223880597015e-07, "loss": 0.0411, "reward": 0.2801339402794838, "reward_std": 0.4527314156293869, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.03794643096625805, "rewards/tag_count_reward": 0.1439732201397419, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 675.8303833007812, "epoch": 0.004779329400343514, "grad_norm": 0.3311849534511566, "kl": 0.003849029541015625, "learning_rate": 9.55223880597015e-07, "loss": 0.0767, "reward": 0.2879464365541935, "reward_std": 0.4580071195960045, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.04687500186264515, "rewards/tag_count_reward": 0.1875000111758709, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 658.7611846923828, "epoch": 0.005078037487864984, "grad_norm": 0.2981855869293213, "kl": 0.00569915771484375, "learning_rate": 1.0149253731343285e-06, "loss": 0.0316, "reward": 0.2957589477300644, "reward_std": 0.4212942570447922, "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.03125000186264515, "rewards/tag_count_reward": 0.1930803693830967, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 610.6652069091797, "epoch": 0.005376745575386453, "grad_norm": 1.6162383556365967, "kl": 0.0546875, "learning_rate": 1.0746268656716418e-06, "loss": 0.0582, "reward": 0.4073660932481289, "reward_std": 0.5211747512221336, "rewards/accuracy_reward": 0.0982142873108387, "rewards/format_reward": 0.08258929196745157, "rewards/tag_count_reward": 0.2265625149011612, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 602.216552734375, "epoch": 0.0056754536629079234, "grad_norm": 6.988424777984619, "kl": 0.180419921875, "learning_rate": 1.1343283582089555e-06, "loss": 0.0424, "reward": 0.4453125223517418, "reward_std": 0.5109957680106163, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0892857201397419, "rewards/tag_count_reward": 0.2645089440047741, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 641.9152221679688, "epoch": 0.005974161750429393, "grad_norm": 2.7693028450012207, "kl": 0.094970703125, "learning_rate": 1.1940298507462686e-06, "loss": 0.068, "reward": 0.4827009215950966, "reward_std": 0.5849575400352478, "rewards/accuracy_reward": 0.14285714668221772, "rewards/format_reward": 0.11383928917348385, "rewards/tag_count_reward": 0.2260044775903225, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 676.7544860839844, "epoch": 0.006272869837950863, "grad_norm": 0.7221543192863464, "kl": 0.036041259765625, "learning_rate": 1.253731343283582e-06, "loss": 0.0377, "reward": 0.3666294813156128, "reward_std": 0.5138736888766289, "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.0848214328289032, "rewards/tag_count_reward": 0.1835937574505806, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 684.9553985595703, "epoch": 0.006571577925472332, "grad_norm": 1.3959470987319946, "kl": 0.02008056640625, "learning_rate": 1.3134328358208956e-06, "loss": 0.0439, "reward": 0.3041294775903225, "reward_std": 0.48464132845401764, "rewards/accuracy_reward": 0.03125000232830644, "rewards/format_reward": 0.06919643189758062, "rewards/tag_count_reward": 0.203683041036129, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 659.6227874755859, "epoch": 0.006870286012993802, "grad_norm": 0.27147364616394043, "kl": 0.01207733154296875, "learning_rate": 1.373134328358209e-06, "loss": 0.0538, "reward": 0.4268973395228386, "reward_std": 0.5855062752962112, "rewards/accuracy_reward": 0.055803572526201606, "rewards/format_reward": 0.12500000558793545, "rewards/tag_count_reward": 0.2460937611758709, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 635.5558319091797, "epoch": 0.007168994100515271, "grad_norm": 0.2978614568710327, "kl": 0.01080322265625, "learning_rate": 1.4328358208955226e-06, "loss": 0.0138, "reward": 0.3671875111758709, "reward_std": 0.43648601323366165, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.04687500232830644, "rewards/tag_count_reward": 0.1595982201397419, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 634.9754791259766, "epoch": 0.007467702188036741, "grad_norm": 0.28692740201950073, "kl": 0.010986328125, "learning_rate": 1.4925373134328358e-06, "loss": 0.0351, "reward": 0.3939732313156128, "reward_std": 0.5311274901032448, "rewards/accuracy_reward": 0.09598214761354029, "rewards/format_reward": 0.0803571455180645, "rewards/tag_count_reward": 0.2176339402794838, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 653.4866333007812, "epoch": 0.00776641027555821, "grad_norm": 0.3115369379520416, "kl": 0.007049560546875, "learning_rate": 1.5522388059701494e-06, "loss": 0.0555, "reward": 0.3470982238650322, "reward_std": 0.5082446187734604, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.07589286006987095, "rewards/tag_count_reward": 0.199776791036129, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 649.3861846923828, "epoch": 0.00806511836307968, "grad_norm": 0.2684030532836914, "kl": 0.00783538818359375, "learning_rate": 1.6119402985074628e-06, "loss": 0.0342, "reward": 0.4341518059372902, "reward_std": 0.42567258328199387, "rewards/accuracy_reward": 0.18973215227015316, "rewards/format_reward": 0.0625000037252903, "rewards/tag_count_reward": 0.18191965110599995, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 673.7768096923828, "epoch": 0.00836382645060115, "grad_norm": 0.365773469209671, "kl": 0.00861358642578125, "learning_rate": 1.6716417910447764e-06, "loss": 0.0513, "reward": 0.3242187649011612, "reward_std": 0.48682042211294174, "rewards/accuracy_reward": 0.07366071594879031, "rewards/format_reward": 0.05803571594879031, "rewards/tag_count_reward": 0.19252233393490314, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 574.810302734375, "epoch": 0.008662534538122619, "grad_norm": 0.3945966362953186, "kl": 0.00943756103515625, "learning_rate": 1.7313432835820898e-06, "loss": 0.0402, "reward": 0.3811384215950966, "reward_std": 0.5165999233722687, "rewards/accuracy_reward": 0.06473214738070965, "rewards/format_reward": 0.07589286006987095, "rewards/tag_count_reward": 0.2405133992433548, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 612.0223388671875, "epoch": 0.00896124262564409, "grad_norm": 0.31475022435188293, "kl": 0.00765228271484375, "learning_rate": 1.791044776119403e-06, "loss": 0.0505, "reward": 0.4112723357975483, "reward_std": 0.5008794963359833, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.06919643329456449, "rewards/tag_count_reward": 0.16796875558793545, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 588.7299346923828, "epoch": 0.009259950713165559, "grad_norm": 0.3783261775970459, "kl": 0.012603759765625, "learning_rate": 1.8507462686567165e-06, "loss": 0.0166, "reward": 0.3392857387661934, "reward_std": 0.48479193076491356, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.07812500186264515, "rewards/tag_count_reward": 0.2120535857975483, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 620.8080596923828, "epoch": 0.009558658800687028, "grad_norm": 0.31461775302886963, "kl": 0.011383056640625, "learning_rate": 1.91044776119403e-06, "loss": 0.0509, "reward": 0.4441964477300644, "reward_std": 0.5653792172670364, "rewards/accuracy_reward": 0.10491071990691125, "rewards/format_reward": 0.0870535746216774, "rewards/tag_count_reward": 0.2522321566939354, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 642.4129638671875, "epoch": 0.009857366888208497, "grad_norm": 0.33666884899139404, "kl": 0.013580322265625, "learning_rate": 1.9701492537313433e-06, "loss": 0.0768, "reward": 0.4603794813156128, "reward_std": 0.6088481843471527, "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.1183035746216774, "rewards/tag_count_reward": 0.2751116156578064, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 612.4062805175781, "epoch": 0.010156074975729968, "grad_norm": 0.3676197826862335, "kl": 0.0199127197265625, "learning_rate": 2.029850746268657e-06, "loss": 0.0484, "reward": 0.4609375149011612, "reward_std": 0.582410179078579, "rewards/accuracy_reward": 0.10044643096625805, "rewards/format_reward": 0.113839291036129, "rewards/tag_count_reward": 0.2466517984867096, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 617.2344055175781, "epoch": 0.010454783063251438, "grad_norm": 0.3593224883079529, "kl": 0.0201416015625, "learning_rate": 2.08955223880597e-06, "loss": 0.0718, "reward": 0.4843750298023224, "reward_std": 0.6080197393894196, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.10267857648432255, "rewards/tag_count_reward": 0.2901785969734192, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 646.9553985595703, "epoch": 0.010753491150772907, "grad_norm": 0.33488398790359497, "kl": 0.026214599609375, "learning_rate": 2.1492537313432837e-06, "loss": 0.0201, "reward": 0.3962053768336773, "reward_std": 0.5660637244582176, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.09821429010480642, "rewards/tag_count_reward": 0.286830373108387, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 604.9888610839844, "epoch": 0.011052199238294378, "grad_norm": 0.3604983389377594, "kl": 0.03656005859375, "learning_rate": 2.2089552238805973e-06, "loss": 0.0426, "reward": 0.6088169813156128, "reward_std": 0.6330413222312927, "rewards/accuracy_reward": 0.12276786053553224, "rewards/format_reward": 0.13169643469154835, "rewards/tag_count_reward": 0.3543526977300644, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 672.3683166503906, "epoch": 0.011350907325815847, "grad_norm": 0.32615405321121216, "kl": 0.035888671875, "learning_rate": 2.268656716417911e-06, "loss": 0.028, "reward": 0.4464285895228386, "reward_std": 0.5749453604221344, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.08482143469154835, "rewards/tag_count_reward": 0.3169643059372902, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 574.9419708251953, "epoch": 0.011649615413337316, "grad_norm": 5.075498104095459, "kl": 0.32513427734375, "learning_rate": 2.328358208955224e-06, "loss": 0.0315, "reward": 0.6149553805589676, "reward_std": 0.6226405948400497, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.1361607201397419, "rewards/tag_count_reward": 0.4140625223517418, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 588.0826110839844, "epoch": 0.011948323500858785, "grad_norm": 1.1203423738479614, "kl": 0.0958251953125, "learning_rate": 2.3880597014925373e-06, "loss": 0.0254, "reward": 0.6741071790456772, "reward_std": 0.6606137305498123, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.129464291036129, "rewards/tag_count_reward": 0.4129464477300644, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 593.2254791259766, "epoch": 0.012247031588380256, "grad_norm": 0.8736413717269897, "kl": 0.0770263671875, "learning_rate": 2.447761194029851e-06, "loss": 0.0075, "reward": 0.7444196939468384, "reward_std": 0.6106376349925995, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.14062500558793545, "rewards/tag_count_reward": 0.4787946715950966, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 596.5535888671875, "epoch": 0.012545739675901725, "grad_norm": 0.604847252368927, "kl": 0.07000732421875, "learning_rate": 2.507462686567164e-06, "loss": 0.0062, "reward": 0.5859375223517418, "reward_std": 0.5702894032001495, "rewards/accuracy_reward": 0.05133928684517741, "rewards/format_reward": 0.1071428619325161, "rewards/tag_count_reward": 0.4274553656578064, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 583.3013610839844, "epoch": 0.012844447763423195, "grad_norm": 0.542985737323761, "kl": 0.0556640625, "learning_rate": 2.5671641791044776e-06, "loss": 0.0077, "reward": 0.6350446790456772, "reward_std": 0.6255106031894684, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.1049107201397419, "rewards/tag_count_reward": 0.4743303805589676, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 527.7277069091797, "epoch": 0.013143155850944664, "grad_norm": 408.0224304199219, "kl": 20.5604248046875, "learning_rate": 2.6268656716417912e-06, "loss": 0.1213, "reward": 0.6316964477300644, "reward_std": 0.5613156408071518, "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.06250000279396772, "rewards/tag_count_reward": 0.4754464477300644, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 607.1674346923828, "epoch": 0.013441863938466135, "grad_norm": 0.46973535418510437, "kl": 0.046905517578125, "learning_rate": 2.686567164179105e-06, "loss": 0.0263, "reward": 0.6294643133878708, "reward_std": 0.5647499933838844, "rewards/accuracy_reward": 0.07142857694998384, "rewards/format_reward": 0.082589291036129, "rewards/tag_count_reward": 0.475446455180645, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 575.8817138671875, "epoch": 0.013740572025987604, "grad_norm": 0.5494984984397888, "kl": 0.05377197265625, "learning_rate": 2.746268656716418e-06, "loss": -0.0006, "reward": 0.6618303880095482, "reward_std": 0.5017362609505653, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.058035716880112886, "rewards/tag_count_reward": 0.510044664144516, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 594.6763610839844, "epoch": 0.014039280113509073, "grad_norm": 1.4997072219848633, "kl": 0.07806396484375, "learning_rate": 2.8059701492537316e-06, "loss": -0.0259, "reward": 0.7315848618745804, "reward_std": 0.5183757320046425, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.049107144586741924, "rewards/tag_count_reward": 0.561941996216774, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 605.9710083007812, "epoch": 0.014337988201030542, "grad_norm": 0.5479697585105896, "kl": 0.0784912109375, "learning_rate": 2.8656716417910452e-06, "loss": -0.0428, "reward": 0.7053571790456772, "reward_std": 0.4893234893679619, "rewards/accuracy_reward": 0.042410717345774174, "rewards/format_reward": 0.06250000232830644, "rewards/tag_count_reward": 0.6004464477300644, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 642.8549346923828, "epoch": 0.014636696288552013, "grad_norm": 0.49467140436172485, "kl": 0.08184814453125, "learning_rate": 2.925373134328359e-06, "loss": -0.031, "reward": 0.6607143133878708, "reward_std": 0.47167380154132843, "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.04687500186264515, "rewards/tag_count_reward": 0.551339328289032, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 632.5402069091797, "epoch": 0.014935404376073482, "grad_norm": 0.473756343126297, "kl": 0.09033203125, "learning_rate": 2.9850746268656716e-06, "loss": -0.047, "reward": 0.7081473618745804, "reward_std": 0.515781968832016, "rewards/accuracy_reward": 0.05803571594879031, "rewards/format_reward": 0.0424107164144516, "rewards/tag_count_reward": 0.6077009290456772, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 601.0379791259766, "epoch": 0.015234112463594952, "grad_norm": 0.6481013298034668, "kl": 0.14013671875, "learning_rate": 3.044776119402985e-06, "loss": -0.0109, "reward": 0.8660714626312256, "reward_std": 0.39530912041664124, "rewards/accuracy_reward": 0.15178572502918541, "rewards/format_reward": 0.026785716181620955, "rewards/tag_count_reward": 0.6875000298023224, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 643.8170013427734, "epoch": 0.01553282055111642, "grad_norm": 0.5966575145721436, "kl": 0.1656494140625, "learning_rate": 3.1044776119402988e-06, "loss": -0.0286, "reward": 0.792410746216774, "reward_std": 0.3898973986506462, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.01562500069849193, "rewards/tag_count_reward": 0.6964285969734192, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 612.4509124755859, "epoch": 0.01583152863863789, "grad_norm": 0.6085568070411682, "kl": 0.17529296875, "learning_rate": 3.164179104477612e-06, "loss": -0.0338, "reward": 0.8297991305589676, "reward_std": 0.36907682567834854, "rewards/accuracy_reward": 0.13392857764847577, "rewards/format_reward": 0.011160715017467737, "rewards/tag_count_reward": 0.6847098618745804, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 622.8817367553711, "epoch": 0.01613023672615936, "grad_norm": 1.5537220239639282, "kl": 0.315185546875, "learning_rate": 3.2238805970149255e-06, "loss": 0.0223, "reward": 0.7723214626312256, "reward_std": 0.35823817551136017, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.011160715017467737, "rewards/tag_count_reward": 0.6406250298023224, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 681.591552734375, "epoch": 0.016428944813680832, "grad_norm": 0.6584349870681763, "kl": 0.1607666015625, "learning_rate": 3.283582089552239e-06, "loss": 0.0037, "reward": 0.6941964477300644, "reward_std": 0.3179081305861473, "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.015625000931322575, "rewards/tag_count_reward": 0.6517857313156128, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 652.4643249511719, "epoch": 0.0167276529012023, "grad_norm": 319911.15625, "kl": 2160.0887451171875, "learning_rate": 3.3432835820895528e-06, "loss": 85.1686, "reward": 0.6791294813156128, "reward_std": 0.3063816428184509, "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.008928572060540318, "rewards/tag_count_reward": 0.65011166036129, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 618.1027221679688, "epoch": 0.01702636098872377, "grad_norm": 0.733905553817749, "kl": 0.1719970703125, "learning_rate": 3.402985074626866e-06, "loss": -0.0077, "reward": 0.6902901977300644, "reward_std": 0.33867958933115005, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.024553572526201606, "rewards/tag_count_reward": 0.6010044813156128, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 607.0692138671875, "epoch": 0.017325069076245238, "grad_norm": 1.3766008615493774, "kl": 0.1568603515625, "learning_rate": 3.4626865671641795e-06, "loss": -0.0344, "reward": 0.741629496216774, "reward_std": 0.34756314754486084, "rewards/accuracy_reward": 0.10044643515720963, "rewards/format_reward": 0.01562500116415322, "rewards/tag_count_reward": 0.6255580484867096, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 636.8303833007812, "epoch": 0.01762377716376671, "grad_norm": 1.3912423849105835, "kl": 0.2027587890625, "learning_rate": 3.5223880597014927e-06, "loss": 0.022, "reward": 0.7399553805589676, "reward_std": 0.2899435833096504, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.011160714784637094, "rewards/tag_count_reward": 0.6417410969734192, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 670.0536041259766, "epoch": 0.01792248525128818, "grad_norm": 1.0158106088638306, "kl": 0.157958984375, "learning_rate": 3.582089552238806e-06, "loss": -0.0016, "reward": 0.7282366305589676, "reward_std": 0.30513499677181244, "rewards/accuracy_reward": 0.09151786286383867, "rewards/format_reward": 0.013392857741564512, "rewards/tag_count_reward": 0.623325914144516, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 628.0513610839844, "epoch": 0.018221193338809647, "grad_norm": 1.7644014358520508, "kl": 0.09783935546875, "learning_rate": 3.6417910447761195e-06, "loss": 0.0107, "reward": 0.7460937798023224, "reward_std": 0.2715965509414673, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.006696428870782256, "rewards/tag_count_reward": 0.6590401977300644, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 594.9263610839844, "epoch": 0.018519901426331118, "grad_norm": 19498222.0, "kl": 40960.060119628906, "learning_rate": 3.701492537313433e-06, "loss": 2746.5063, "reward": 0.722098246216774, "reward_std": 0.2685219645500183, "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.006696428870782256, "rewards/tag_count_reward": 0.6729910969734192, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 641.5089721679688, "epoch": 0.01881860951385259, "grad_norm": 1.591345191001892, "kl": 0.0908203125, "learning_rate": 3.7611940298507467e-06, "loss": 0.0215, "reward": 0.6679687798023224, "reward_std": 0.2643398717045784, "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.6411830484867096, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 636.1652069091797, "epoch": 0.019117317601374056, "grad_norm": 1.9380568265914917, "kl": 0.112548828125, "learning_rate": 3.82089552238806e-06, "loss": -0.026, "reward": 0.832589328289032, "reward_std": 0.30492619052529335, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.008928571827709675, "rewards/tag_count_reward": 0.6540178656578064, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 725.0826263427734, "epoch": 0.019416025688895527, "grad_norm": 1.2353018522262573, "kl": 0.1473388671875, "learning_rate": 3.8805970149253735e-06, "loss": 0.025, "reward": 0.7393973618745804, "reward_std": 0.29288356378674507, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.6679687947034836, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 677.9040374755859, "epoch": 0.019714733776416995, "grad_norm": 1.4373409748077393, "kl": 0.341796875, "learning_rate": 3.940298507462687e-06, "loss": 0.0133, "reward": 0.7751116305589676, "reward_std": 0.22390684857964516, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6925223469734192, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 730.4777069091797, "epoch": 0.020013441863938466, "grad_norm": 2.9818594455718994, "kl": 0.37646484375, "learning_rate": 4.000000000000001e-06, "loss": 0.0583, "reward": 0.816964328289032, "reward_std": 0.2748863026499748, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6875000298023224, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 711.4844055175781, "epoch": 0.020312149951459937, "grad_norm": 18.574230194091797, "kl": 0.6044921875, "learning_rate": 4.059701492537314e-06, "loss": 0.0513, "reward": 0.8554687947034836, "reward_std": 0.26562799140810966, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.688058078289032, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 674.1652069091797, "epoch": 0.020610858038981404, "grad_norm": 4.039796829223633, "kl": 0.404541015625, "learning_rate": 4.119402985074627e-06, "loss": 0.0403, "reward": 0.7678571939468384, "reward_std": 0.23195307329297066, "rewards/accuracy_reward": 0.07142857415601611, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.691964328289032, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 695.4286041259766, "epoch": 0.020909566126502875, "grad_norm": 4.75510835647583, "kl": 0.724609375, "learning_rate": 4.17910447761194e-06, "loss": 0.0554, "reward": 0.7885045111179352, "reward_std": 0.2551274336874485, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.694754496216774, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 636.7968902587891, "epoch": 0.021208274214024346, "grad_norm": 3.2227749824523926, "kl": 0.4111328125, "learning_rate": 4.238805970149254e-06, "loss": 0.0538, "reward": 0.8627232611179352, "reward_std": 0.2549118548631668, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.7176339775323868, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 661.5580749511719, "epoch": 0.021506982301545814, "grad_norm": 3.1343188285827637, "kl": 0.2376708984375, "learning_rate": 4.298507462686567e-06, "loss": 0.0123, "reward": 0.808035746216774, "reward_std": 0.16757013276219368, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.707589328289032, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 650.7277069091797, "epoch": 0.021805690389067284, "grad_norm": 2.557448387145996, "kl": 0.16552734375, "learning_rate": 4.358208955223881e-06, "loss": 0.0486, "reward": 0.8922991305589676, "reward_std": 0.19188811630010605, "rewards/accuracy_reward": 0.1941964440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6981027126312256, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 673.9241333007812, "epoch": 0.022104398476588755, "grad_norm": 8.732964515686035, "kl": 1.0693359375, "learning_rate": 4.417910447761195e-06, "loss": 0.0962, "reward": 0.7366071939468384, "reward_std": 0.1954764500260353, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.7098214775323868, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 702.5915374755859, "epoch": 0.022403106564110223, "grad_norm": 11.969191551208496, "kl": 0.97265625, "learning_rate": 4.477611940298508e-06, "loss": 0.0544, "reward": 0.792410746216774, "reward_std": 0.21050791814923286, "rewards/accuracy_reward": 0.07812500209547579, "rewards/format_reward": 0.006696428870782256, "rewards/tag_count_reward": 0.7075893133878708, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 691.4977874755859, "epoch": 0.022701814651631694, "grad_norm": 2.7221531867980957, "kl": 0.202392578125, "learning_rate": 4.537313432835822e-06, "loss": 0.0326, "reward": 0.7695312798023224, "reward_std": 0.17252591252326965, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.695870578289032, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 663.5312957763672, "epoch": 0.02300052273915316, "grad_norm": 2.4278721809387207, "kl": 0.326171875, "learning_rate": 4.597014925373134e-06, "loss": 0.0282, "reward": 0.8493303805589676, "reward_std": 0.22954249009490013, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.7087053805589676, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 703.4107513427734, "epoch": 0.023299230826674632, "grad_norm": 2.6952664852142334, "kl": 0.4169921875, "learning_rate": 4.656716417910448e-06, "loss": 0.0594, "reward": 0.8805803954601288, "reward_std": 0.2391996942460537, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.006696428870782256, "rewards/tag_count_reward": 0.7154018133878708, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 656.3326110839844, "epoch": 0.023597938914196103, "grad_norm": 4.472315788269043, "kl": 0.3486328125, "learning_rate": 4.716417910447761e-06, "loss": 0.0149, "reward": 0.8498884290456772, "reward_std": 0.26259494200348854, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.029017858672887087, "rewards/tag_count_reward": 0.7449777126312256, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 705.1049499511719, "epoch": 0.02389664700171757, "grad_norm": 2.735755443572998, "kl": 0.244384765625, "learning_rate": 4.7761194029850745e-06, "loss": 0.0282, "reward": 0.785714328289032, "reward_std": 0.21690813452005386, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.6919643133878708, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 696.075927734375, "epoch": 0.02419535508923904, "grad_norm": 5.9472246170043945, "kl": 0.2030029296875, "learning_rate": 4.8358208955223885e-06, "loss": 0.0006, "reward": 0.8420759290456772, "reward_std": 0.22303017601370811, "rewards/accuracy_reward": 0.10937500721774995, "rewards/format_reward": 0.011160714784637094, "rewards/tag_count_reward": 0.7215402126312256, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 666.2589721679688, "epoch": 0.024494063176760512, "grad_norm": 1.790752649307251, "kl": 0.259033203125, "learning_rate": 4.895522388059702e-06, "loss": 0.0118, "reward": 0.8113839626312256, "reward_std": 0.24997325986623764, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.01562500116415322, "rewards/tag_count_reward": 0.737723246216774, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 646.8437805175781, "epoch": 0.02479277126428198, "grad_norm": 1.9481605291366577, "kl": 0.1195068359375, "learning_rate": 4.955223880597016e-06, "loss": 0.0255, "reward": 0.8560268133878708, "reward_std": 0.2701725475490093, "rewards/accuracy_reward": 0.06473214412108064, "rewards/format_reward": 0.03125000069849193, "rewards/tag_count_reward": 0.7600446790456772, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 647.9129638671875, "epoch": 0.02509147935180345, "grad_norm": 2.0290024280548096, "kl": 0.20458984375, "learning_rate": 5.014925373134328e-06, "loss": 0.0139, "reward": 0.913504496216774, "reward_std": 0.33315127342939377, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.03794643026776612, "rewards/tag_count_reward": 0.79073666036129, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 678.9955596923828, "epoch": 0.02539018743932492, "grad_norm": 2.2025809288024902, "kl": 0.270263671875, "learning_rate": 5.074626865671642e-06, "loss": 0.0846, "reward": 0.9319197088479996, "reward_std": 0.3726152628660202, "rewards/accuracy_reward": 0.13169643771834671, "rewards/format_reward": 0.05803571827709675, "rewards/tag_count_reward": 0.7421875447034836, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 685.1830596923828, "epoch": 0.02568889552684639, "grad_norm": 2.2146761417388916, "kl": 0.1302490234375, "learning_rate": 5.134328358208955e-06, "loss": 0.037, "reward": 0.9017857611179352, "reward_std": 0.2916329875588417, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.04910714481957257, "rewards/tag_count_reward": 0.7991071790456772, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 625.5558319091797, "epoch": 0.02598760361436786, "grad_norm": 5.157720565795898, "kl": 0.218505859375, "learning_rate": 5.194029850746269e-06, "loss": 0.0166, "reward": 1.0150670260190964, "reward_std": 0.3997092768549919, "rewards/accuracy_reward": 0.16294643771834671, "rewards/format_reward": 0.06250000093132257, "rewards/tag_count_reward": 0.7896205633878708, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 607.9330749511719, "epoch": 0.026286311701889328, "grad_norm": 15.284077644348145, "kl": 0.654296875, "learning_rate": 5.2537313432835825e-06, "loss": 0.0238, "reward": 0.8967634290456772, "reward_std": 0.3927447199821472, "rewards/accuracy_reward": 0.029017859138548374, "rewards/format_reward": 0.0848214328289032, "rewards/tag_count_reward": 0.7829241454601288, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 631.1250305175781, "epoch": 0.0265850197894108, "grad_norm": 26.518407821655273, "kl": 0.56591796875, "learning_rate": 5.3134328358208965e-06, "loss": 0.0521, "reward": 1.0658482313156128, "reward_std": 0.41280023008584976, "rewards/accuracy_reward": 0.13839286658912897, "rewards/format_reward": 0.11830357648432255, "rewards/tag_count_reward": 0.809151828289032, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 615.3839569091797, "epoch": 0.02688372787693227, "grad_norm": 155.20545959472656, "kl": 3.462890625, "learning_rate": 5.37313432835821e-06, "loss": 0.2267, "reward": 1.0228795111179352, "reward_std": 0.40816638618707657, "rewards/accuracy_reward": 0.10714285867288709, "rewards/format_reward": 0.1004464328289032, "rewards/tag_count_reward": 0.8152902126312256, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 554.3326187133789, "epoch": 0.027182435964453737, "grad_norm": 320.642333984375, "kl": 7.6875, "learning_rate": 5.432835820895522e-06, "loss": 0.3924, "reward": 0.9654018431901932, "reward_std": 0.3856255114078522, "rewards/accuracy_reward": 0.02901785750873387, "rewards/format_reward": 0.0959821492433548, "rewards/tag_count_reward": 0.840401828289032, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 601.6428833007812, "epoch": 0.027481144051975208, "grad_norm": 51.33663558959961, "kl": 2.2158203125, "learning_rate": 5.492537313432836e-06, "loss": 0.1367, "reward": 1.0122768133878708, "reward_std": 0.4686165601015091, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.1339285746216774, "rewards/tag_count_reward": 0.8180803805589676, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 567.8370819091797, "epoch": 0.027779852139496675, "grad_norm": 18.307682037353516, "kl": 0.73828125, "learning_rate": 5.552238805970149e-06, "loss": 0.02, "reward": 0.9503348618745804, "reward_std": 0.37829604744911194, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0870535746216774, "rewards/tag_count_reward": 0.7918527126312256, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 618.0937805175781, "epoch": 0.028078560227018146, "grad_norm": 10.085908889770508, "kl": 0.359130859375, "learning_rate": 5.611940298507463e-06, "loss": -0.0028, "reward": 0.9631696939468384, "reward_std": 0.4257892817258835, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0870535783469677, "rewards/tag_count_reward": 0.7979910969734192, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 596.5893096923828, "epoch": 0.028377268314539617, "grad_norm": 639.044189453125, "kl": 3.857177734375, "learning_rate": 5.671641791044776e-06, "loss": 0.1781, "reward": 0.9882812947034836, "reward_std": 0.42067524790763855, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.1160714328289032, "rewards/tag_count_reward": 0.7940848618745804, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 637.9442138671875, "epoch": 0.028675976402061085, "grad_norm": 129.73947143554688, "kl": 1.29052734375, "learning_rate": 5.7313432835820904e-06, "loss": 0.0323, "reward": 0.9464286118745804, "reward_std": 0.4271984025835991, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.1004464328289032, "rewards/tag_count_reward": 0.754464328289032, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 641.9129791259766, "epoch": 0.028974684489582556, "grad_norm": 1091.4658203125, "kl": 7.48779296875, "learning_rate": 5.791044776119404e-06, "loss": 0.4203, "reward": 0.8152902126312256, "reward_std": 0.3594542294740677, "rewards/accuracy_reward": 0.04687500325962901, "rewards/format_reward": 0.05133928777649999, "rewards/tag_count_reward": 0.7170759290456772, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 607.3080596923828, "epoch": 0.029273392577104027, "grad_norm": 22.719772338867188, "kl": 0.8359375, "learning_rate": 5.850746268656718e-06, "loss": 0.002, "reward": 0.9023437798023224, "reward_std": 0.31143787130713463, "rewards/accuracy_reward": 0.15625000838190317, "rewards/format_reward": 0.03125000186264515, "rewards/tag_count_reward": 0.7148437947034836, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 603.1339416503906, "epoch": 0.029572100664625494, "grad_norm": 13162.8974609375, "kl": 40.5048828125, "learning_rate": 5.91044776119403e-06, "loss": 2.256, "reward": 0.722098246216774, "reward_std": 0.2875635400414467, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.015625000931322575, "rewards/tag_count_reward": 0.6640625298023224, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 571.9129638671875, "epoch": 0.029870808752146965, "grad_norm": 14.862873077392578, "kl": 1.4189453125, "learning_rate": 5.970149253731343e-06, "loss": 0.0634, "reward": 0.768973246216774, "reward_std": 0.28914401680231094, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.01562500069849193, "rewards/tag_count_reward": 0.6506696790456772, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 648.2745819091797, "epoch": 0.030169516839668432, "grad_norm": 681.864990234375, "kl": 2.2900390625, "learning_rate": 6.029850746268657e-06, "loss": 0.1322, "reward": 0.7706473469734192, "reward_std": 0.23058322817087173, "rewards/accuracy_reward": 0.10267857881262898, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6657366454601288, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 632.7254791259766, "epoch": 0.030468224927189903, "grad_norm": 5.931107044219971, "kl": 0.41650390625, "learning_rate": 6.08955223880597e-06, "loss": 0.0334, "reward": 0.7645089626312256, "reward_std": 0.1856892667710781, "rewards/accuracy_reward": 0.09598214295692742, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.668526828289032, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 575.7187957763672, "epoch": 0.030766933014711374, "grad_norm": 6.276047706604004, "kl": 0.466064453125, "learning_rate": 6.149253731343284e-06, "loss": 0.0122, "reward": 0.7539062947034836, "reward_std": 0.210929274559021, "rewards/accuracy_reward": 0.0915178582072258, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6601562649011612, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 566.2098541259766, "epoch": 0.03106564110223284, "grad_norm": 71.37544250488281, "kl": 1.7412109375, "learning_rate": 6.2089552238805975e-06, "loss": 0.1218, "reward": 0.75948666036129, "reward_std": 0.23967062309384346, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6389508992433548, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 633.2455596923828, "epoch": 0.03136434918975431, "grad_norm": 89.9391860961914, "kl": 3.572265625, "learning_rate": 6.2686567164179116e-06, "loss": 0.1555, "reward": 0.748325914144516, "reward_std": 0.22144035249948502, "rewards/accuracy_reward": 0.10044643259607255, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6456473618745804, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 547.8995742797852, "epoch": 0.03166305727727578, "grad_norm": 12.143160820007324, "kl": 2.091796875, "learning_rate": 6.328358208955224e-06, "loss": -0.0039, "reward": 0.6969866454601288, "reward_std": 0.23053672537207603, "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.663504496216774, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 570.9710083007812, "epoch": 0.03196176536479725, "grad_norm": 1859.107421875, "kl": 29.529296875, "learning_rate": 6.388059701492538e-06, "loss": 1.5405, "reward": 0.7126116454601288, "reward_std": 0.2603135369718075, "rewards/accuracy_reward": 0.09598214854486287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.616629496216774, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 578.279052734375, "epoch": 0.03226047345231872, "grad_norm": 33.14042282104492, "kl": 2.44140625, "learning_rate": 6.447761194029851e-06, "loss": 0.0424, "reward": 0.6835937798023224, "reward_std": 0.30326637253165245, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6032366156578064, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 556.8951263427734, "epoch": 0.03255918153984019, "grad_norm": 5.115396022796631, "kl": 2.03515625, "learning_rate": 6.507462686567164e-06, "loss": 0.0011, "reward": 0.7734375298023224, "reward_std": 0.23361939936876297, "rewards/accuracy_reward": 0.1607142905704677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6127232611179352, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 532.5982360839844, "epoch": 0.032857889627361664, "grad_norm": 4.742506980895996, "kl": 1.3671875, "learning_rate": 6.567164179104478e-06, "loss": 0.0036, "reward": 0.702566996216774, "reward_std": 0.2176562026143074, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6422991305589676, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 575.5379791259766, "epoch": 0.03315659771488313, "grad_norm": 7.21842622756958, "kl": 1.1376953125, "learning_rate": 6.6268656716417915e-06, "loss": -0.0231, "reward": 0.6958705633878708, "reward_std": 0.2057226300239563, "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6556919813156128, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 580.1339645385742, "epoch": 0.0334553058024046, "grad_norm": 2.4824275970458984, "kl": 0.450439453125, "learning_rate": 6.6865671641791055e-06, "loss": 0.0094, "reward": 0.7410714626312256, "reward_std": 0.1739942468702793, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6696428805589676, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 578.1986999511719, "epoch": 0.03375401388992607, "grad_norm": 3.655194044113159, "kl": 0.455322265625, "learning_rate": 6.746268656716418e-06, "loss": -0.0209, "reward": 0.8593750298023224, "reward_std": 0.1618405394256115, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7031250149011612, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 573.0848541259766, "epoch": 0.03405272197744754, "grad_norm": 53.81230163574219, "kl": 2.93359375, "learning_rate": 6.805970149253732e-06, "loss": 0.0937, "reward": 0.7890625298023224, "reward_std": 0.19238078221678734, "rewards/accuracy_reward": 0.09821429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6908482611179352, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 682.7969055175781, "epoch": 0.03435143006496901, "grad_norm": 2.9026825428009033, "kl": 1.1240234375, "learning_rate": 6.865671641791045e-06, "loss": -0.0357, "reward": 0.7098214626312256, "reward_std": 0.19716962426900864, "rewards/accuracy_reward": 0.026785715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.683035746216774, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 633.2656402587891, "epoch": 0.034650138152490476, "grad_norm": 369.4264221191406, "kl": 6.4189453125, "learning_rate": 6.925373134328359e-06, "loss": 0.3363, "reward": 0.711495578289032, "reward_std": 0.1658963244408369, "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7003348469734192, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 656.0156555175781, "epoch": 0.034948846240011947, "grad_norm": 28.95267677307129, "kl": 2.8515625, "learning_rate": 6.985074626865672e-06, "loss": -0.0012, "reward": 0.7594866305589676, "reward_std": 0.19537752121686935, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6768973469734192, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 616.3839416503906, "epoch": 0.03524755432753342, "grad_norm": 9.347858428955078, "kl": 1.779296875, "learning_rate": 7.044776119402985e-06, "loss": -0.0304, "reward": 0.8465402126312256, "reward_std": 0.16682217456400394, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6902902126312256, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 616.9375305175781, "epoch": 0.03554626241505489, "grad_norm": 11.9329252243042, "kl": 1.224609375, "learning_rate": 7.1044776119402994e-06, "loss": -0.0, "reward": 0.7667411118745804, "reward_std": 0.20202117785811424, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125298023224, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 646.5513763427734, "epoch": 0.03584497050257636, "grad_norm": 31.18320655822754, "kl": 3.89453125, "learning_rate": 7.164179104477612e-06, "loss": 0.0397, "reward": 0.7527902126312256, "reward_std": 0.1763654351234436, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6992187649011612, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 586.3928680419922, "epoch": 0.03614367859009783, "grad_norm": 11.957996368408203, "kl": 2.8125, "learning_rate": 7.223880597014926e-06, "loss": -0.0448, "reward": 0.7343750447034836, "reward_std": 0.19101672433316708, "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6941964626312256, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 621.8750152587891, "epoch": 0.036442386677619294, "grad_norm": 70.43120574951172, "kl": 4.501953125, "learning_rate": 7.283582089552239e-06, "loss": 0.1132, "reward": 0.7170759439468384, "reward_std": 0.13490702770650387, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7103795111179352, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 651.0111694335938, "epoch": 0.036741094765140765, "grad_norm": 8.444682121276855, "kl": 2.6015625, "learning_rate": 7.343283582089553e-06, "loss": -0.0273, "reward": 0.8085937798023224, "reward_std": 0.23777784779667854, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6947545111179352, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 606.4598541259766, "epoch": 0.037039802852662236, "grad_norm": 5.6161932945251465, "kl": 0.8427734375, "learning_rate": 7.402985074626866e-06, "loss": -0.0143, "reward": 0.7862723618745804, "reward_std": 0.18006107583642006, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7081473618745804, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 627.4486846923828, "epoch": 0.03733851094018371, "grad_norm": 7.0942559242248535, "kl": 0.931640625, "learning_rate": 7.46268656716418e-06, "loss": -0.0676, "reward": 0.7840402126312256, "reward_std": 0.1681873705238104, "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6992187798023224, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 655.1027069091797, "epoch": 0.03763721902770518, "grad_norm": 1.4384223222732544, "kl": 0.2396240234375, "learning_rate": 7.522388059701493e-06, "loss": -0.0145, "reward": 0.8526786118745804, "reward_std": 0.12629853375256062, "rewards/accuracy_reward": 0.13392857881262898, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500298023224, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 548.4419860839844, "epoch": 0.03793592711522664, "grad_norm": 92.60130310058594, "kl": 5.88916015625, "learning_rate": 7.582089552238806e-06, "loss": 0.2181, "reward": 0.8465402275323868, "reward_std": 0.16613734140992165, "rewards/accuracy_reward": 0.13839286682195961, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.7059152126312256, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 454.8616256713867, "epoch": 0.03823463520274811, "grad_norm": 692.6163330078125, "kl": 32.390625, "learning_rate": 7.64179104477612e-06, "loss": 1.8762, "reward": 0.7583705633878708, "reward_std": 0.23884085938334465, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.015625000465661287, "rewards/tag_count_reward": 0.6802455633878708, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 306.15179443359375, "epoch": 0.038533343290269584, "grad_norm": 881.03857421875, "kl": 39.21875, "learning_rate": 7.701492537313433e-06, "loss": 2.8906, "reward": 0.72433041036129, "reward_std": 0.2969442456960678, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.022321430034935474, "rewards/tag_count_reward": 0.5904017984867096, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 289.5870704650879, "epoch": 0.038832051377791055, "grad_norm": 109.1254653930664, "kl": 10.421875, "learning_rate": 7.761194029850747e-06, "loss": 0.6701, "reward": 0.6216517984867096, "reward_std": 0.32256292924284935, "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.03125000116415322, "rewards/tag_count_reward": 0.5524553805589676, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 291.3348274230957, "epoch": 0.039130759465312526, "grad_norm": 18.161333084106445, "kl": 5.46875, "learning_rate": 7.82089552238806e-06, "loss": 0.2608, "reward": 0.5898437649011612, "reward_std": 0.32965492457151413, "rewards/accuracy_reward": 0.020089286379516125, "rewards/format_reward": 0.029017857741564512, "rewards/tag_count_reward": 0.5407366305589676, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 471.39064025878906, "epoch": 0.03942946755283399, "grad_norm": 3.7280235290527344, "kl": 0.69140625, "learning_rate": 7.880597014925373e-06, "loss": -0.0155, "reward": 0.7315848618745804, "reward_std": 0.23588190972805023, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.013392857741564512, "rewards/tag_count_reward": 0.6824777126312256, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 492.9018096923828, "epoch": 0.03972817564035546, "grad_norm": 2.8974409103393555, "kl": 0.49462890625, "learning_rate": 7.940298507462687e-06, "loss": -0.0298, "reward": 0.8097098469734192, "reward_std": 0.22627347335219383, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6757812798023224, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 563.3013610839844, "epoch": 0.04002688372787693, "grad_norm": 3.7794578075408936, "kl": 0.626953125, "learning_rate": 8.000000000000001e-06, "loss": 0.0741, "reward": 0.7126116305589676, "reward_std": 0.22166389599442482, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.630022332072258, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 572.0201187133789, "epoch": 0.0403255918153984, "grad_norm": 1.9304888248443604, "kl": 0.6162109375, "learning_rate": 8.059701492537314e-06, "loss": -0.0071, "reward": 0.714285746216774, "reward_std": 0.26515786349773407, "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6272321492433548, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 619.4218902587891, "epoch": 0.04062429990291987, "grad_norm": 2.774110794067383, "kl": 0.47900390625, "learning_rate": 8.119402985074628e-06, "loss": -0.0093, "reward": 0.664620578289032, "reward_std": 0.2247898168861866, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.646763414144516, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 599.2991333007812, "epoch": 0.040923007990441344, "grad_norm": 1.9465110301971436, "kl": 0.43994140625, "learning_rate": 8.179104477611942e-06, "loss": 0.0262, "reward": 0.6875000298023224, "reward_std": 0.22883658856153488, "rewards/accuracy_reward": 0.02678571525029838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6607143133878708, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 600.9754791259766, "epoch": 0.04122171607796281, "grad_norm": 5.5333099365234375, "kl": 1.1787109375, "learning_rate": 8.238805970149254e-06, "loss": 0.064, "reward": 0.761160746216774, "reward_std": 0.24738822877407074, "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6629464477300644, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 534.7701263427734, "epoch": 0.04152042416548428, "grad_norm": 3.5813043117523193, "kl": 1.251953125, "learning_rate": 8.298507462686568e-06, "loss": -0.0506, "reward": 0.8683036118745804, "reward_std": 0.28808651119470596, "rewards/accuracy_reward": 0.2343750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6339285969734192, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 563.6629791259766, "epoch": 0.04181913225300575, "grad_norm": 7.13264274597168, "kl": 2.8828125, "learning_rate": 8.35820895522388e-06, "loss": -0.0155, "reward": 0.6802455633878708, "reward_std": 0.29592887312173843, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6043526977300644, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 602.6384124755859, "epoch": 0.04211784034052722, "grad_norm": 4.31461238861084, "kl": 2.30078125, "learning_rate": 8.417910447761194e-06, "loss": -0.02, "reward": 0.6685268133878708, "reward_std": 0.29320163652300835, "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5970982313156128, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 634.0000457763672, "epoch": 0.04241654842804869, "grad_norm": 6.87827730178833, "kl": 3.171875, "learning_rate": 8.477611940298508e-06, "loss": 0.0329, "reward": 0.6456473469734192, "reward_std": 0.30153534561395645, "rewards/accuracy_reward": 0.06026786146685481, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5853794813156128, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 750.6897735595703, "epoch": 0.042715256515570156, "grad_norm": 2.5603559017181396, "kl": 0.984375, "learning_rate": 8.537313432835822e-06, "loss": 0.0253, "reward": 0.6523437798023224, "reward_std": 0.30198313295841217, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5362723469734192, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 704.1808319091797, "epoch": 0.04301396460309163, "grad_norm": 3.121093988418579, "kl": 1.4931640625, "learning_rate": 8.597014925373135e-06, "loss": -0.0404, "reward": 0.655691996216774, "reward_std": 0.2981906682252884, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6043527126312256, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 607.7500152587891, "epoch": 0.0433126726906131, "grad_norm": 6.171100616455078, "kl": 1.71484375, "learning_rate": 8.656716417910447e-06, "loss": -0.0752, "reward": 0.713169664144516, "reward_std": 0.2695082351565361, "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6350446790456772, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 585.2567291259766, "epoch": 0.04361138077813457, "grad_norm": 2.2366538047790527, "kl": 0.5595703125, "learning_rate": 8.716417910447761e-06, "loss": -0.0949, "reward": 0.7276786118745804, "reward_std": 0.16472654417157173, "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000447034836, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 531.5647430419922, "epoch": 0.04391008886565604, "grad_norm": 2.517665386199951, "kl": 0.990234375, "learning_rate": 8.776119402985075e-06, "loss": -0.1096, "reward": 0.8058036118745804, "reward_std": 0.2503645643591881, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6852678954601288, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 572.716552734375, "epoch": 0.04420879695317751, "grad_norm": 8.145984649658203, "kl": 1.001953125, "learning_rate": 8.83582089552239e-06, "loss": -0.1024, "reward": 0.7388393133878708, "reward_std": 0.23698630928993225, "rewards/accuracy_reward": 0.07366071571595967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651786118745804, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 565.3192291259766, "epoch": 0.044507505040698975, "grad_norm": 1.9990880489349365, "kl": 0.85595703125, "learning_rate": 8.895522388059702e-06, "loss": -0.1114, "reward": 0.7767857611179352, "reward_std": 0.23966813832521439, "rewards/accuracy_reward": 0.10044643701985478, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6763392984867096, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 607.0759124755859, "epoch": 0.044806213128220446, "grad_norm": 37.12355422973633, "kl": 3.30078125, "learning_rate": 8.955223880597016e-06, "loss": 0.0284, "reward": 0.7165178954601288, "reward_std": 0.24247300624847412, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.658482164144516, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 562.0759124755859, "epoch": 0.04510492121574192, "grad_norm": 2.849501848220825, "kl": 1.08984375, "learning_rate": 9.01492537313433e-06, "loss": -0.0994, "reward": 0.7014509290456772, "reward_std": 0.2193455882370472, "rewards/accuracy_reward": 0.04687500046566129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6545759290456772, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 625.0982513427734, "epoch": 0.04540362930326339, "grad_norm": 1.179031491279602, "kl": 1.7783203125, "learning_rate": 9.074626865671644e-06, "loss": -0.151, "reward": 0.7695312798023224, "reward_std": 0.24140552058815956, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6579241305589676, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 574.3549194335938, "epoch": 0.04570233739078486, "grad_norm": 1.3275632858276367, "kl": 1.2822265625, "learning_rate": 9.134328358208956e-06, "loss": -0.125, "reward": 0.8180803954601288, "reward_std": 0.2165326178073883, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6819196939468384, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 557.7678833007812, "epoch": 0.04600104547830632, "grad_norm": 5.618509292602539, "kl": 2.06640625, "learning_rate": 9.194029850746268e-06, "loss": -0.1089, "reward": 0.7243303954601288, "reward_std": 0.2585441805422306, "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6618303954601288, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 542.6540374755859, "epoch": 0.04629975356582779, "grad_norm": 16.494457244873047, "kl": 3.75, "learning_rate": 9.253731343283582e-06, "loss": -0.042, "reward": 0.8203125298023224, "reward_std": 0.2708568014204502, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6551339477300644, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 596.7835083007812, "epoch": 0.046598461653349264, "grad_norm": 7.047369003295898, "kl": 6.22265625, "learning_rate": 9.313432835820896e-06, "loss": -0.01, "reward": 0.7544643133878708, "reward_std": 0.23737580329179764, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6272321790456772, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 544.8393096923828, "epoch": 0.046897169740870735, "grad_norm": 73.30524444580078, "kl": 5.23046875, "learning_rate": 9.37313432835821e-06, "loss": 0.0731, "reward": 0.7193080633878708, "reward_std": 0.25406264513731003, "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6434151977300644, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 512.1339569091797, "epoch": 0.047195877828392206, "grad_norm": 5.48018217086792, "kl": 4.4453125, "learning_rate": 9.432835820895523e-06, "loss": -0.0052, "reward": 0.6735491305589676, "reward_std": 0.26916007697582245, "rewards/accuracy_reward": 0.040178572526201606, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.6289062798023224, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 595.1808319091797, "epoch": 0.04749458591591367, "grad_norm": 8.918991088867188, "kl": 5.83203125, "learning_rate": 9.492537313432837e-06, "loss": 0.0126, "reward": 0.6590401977300644, "reward_std": 0.28360046446323395, "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5541294813156128, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 678.6451110839844, "epoch": 0.04779329400343514, "grad_norm": 6.007251739501953, "kl": 4.13671875, "learning_rate": 9.552238805970149e-06, "loss": 0.0801, "reward": 0.7003348469734192, "reward_std": 0.30545568466186523, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5284598469734192, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 866.1629943847656, "epoch": 0.04809200209095661, "grad_norm": 2.1484997272491455, "kl": 5.203125, "learning_rate": 9.611940298507465e-06, "loss": 0.1072, "reward": 0.4754464402794838, "reward_std": 0.2935132309794426, "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375000223517418, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 832.1897735595703, "epoch": 0.04839071017847808, "grad_norm": 16.664348602294922, "kl": 2.7734375, "learning_rate": 9.671641791044777e-06, "loss": 0.1104, "reward": 0.5825893059372902, "reward_std": 0.3373779430985451, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466517873108387, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 832.8460083007812, "epoch": 0.048689418265999554, "grad_norm": 2.403515338897705, "kl": 1.3701171875, "learning_rate": 9.73134328358209e-06, "loss": 0.0829, "reward": 0.5792410969734192, "reward_std": 0.29818272590637207, "rewards/accuracy_reward": 0.06696428684517741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5122768059372902, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 751.0536041259766, "epoch": 0.048988126353521025, "grad_norm": 5.558374404907227, "kl": 0.8759765625, "learning_rate": 9.791044776119403e-06, "loss": 0.0501, "reward": 0.6729910969734192, "reward_std": 0.2854542136192322, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625298023224, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 631.8750457763672, "epoch": 0.04928683444104249, "grad_norm": 6.625753879547119, "kl": 0.787109375, "learning_rate": 9.850746268656717e-06, "loss": 0.0348, "reward": 0.6930803805589676, "reward_std": 0.27067309617996216, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6395089477300644, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 526.381721496582, "epoch": 0.04958554252856396, "grad_norm": 4.093009948730469, "kl": 0.61474609375, "learning_rate": 9.910447761194031e-06, "loss": 0.0128, "reward": 0.7857143133878708, "reward_std": 0.2249591127038002, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.676339328289032, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 478.11163330078125, "epoch": 0.04988425061608543, "grad_norm": 10.771910667419434, "kl": 1.1572265625, "learning_rate": 9.970149253731344e-06, "loss": 0.0404, "reward": 0.7952009290456772, "reward_std": 0.20312092825770378, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6724330633878708, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 498.13172149658203, "epoch": 0.0501829587036069, "grad_norm": 5.420681476593018, "kl": 0.6552734375, "learning_rate": 1.0029850746268656e-05, "loss": 0.0384, "reward": 0.686941996216774, "reward_std": 0.17045965418219566, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6802455633878708, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 468.08038330078125, "epoch": 0.05048166679112837, "grad_norm": 4.691173076629639, "kl": 1.19140625, "learning_rate": 1.008955223880597e-05, "loss": -0.0262, "reward": 0.6824777126312256, "reward_std": 0.20448832213878632, "rewards/accuracy_reward": 0.015625000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6668526977300644, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 474.25894927978516, "epoch": 0.05078037487864984, "grad_norm": 26.995803833007812, "kl": 3.599609375, "learning_rate": 1.0149253731343284e-05, "loss": 0.0257, "reward": 0.7494420111179352, "reward_std": 0.19775470346212387, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6579241305589676, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 440.9955596923828, "epoch": 0.05107908296617131, "grad_norm": 29.177736282348633, "kl": 2.677734375, "learning_rate": 1.0208955223880598e-05, "loss": 0.0445, "reward": 0.7276785969734192, "reward_std": 0.2155926115810871, "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651786118745804, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 432.8348388671875, "epoch": 0.05137779105369278, "grad_norm": 85.32209777832031, "kl": 3.17578125, "learning_rate": 1.026865671641791e-05, "loss": 0.1168, "reward": 0.6841518133878708, "reward_std": 0.20892326161265373, "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6595982313156128, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 443.72322845458984, "epoch": 0.05167649914121425, "grad_norm": 230.63125610351562, "kl": 4.80078125, "learning_rate": 1.0328358208955225e-05, "loss": 0.1995, "reward": 0.683035746216774, "reward_std": 0.23798871040344238, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.627232164144516, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 451.38841247558594, "epoch": 0.05197520722873572, "grad_norm": 38.79052734375, "kl": 5.0390625, "learning_rate": 1.0388059701492539e-05, "loss": 0.1041, "reward": 0.6428571790456772, "reward_std": 0.21869463101029396, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6339285969734192, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 471.7902069091797, "epoch": 0.052273915316257184, "grad_norm": 6.357179164886475, "kl": 1.984375, "learning_rate": 1.0447761194029851e-05, "loss": 0.0271, "reward": 0.6718750298023224, "reward_std": 0.22826843336224556, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000149011612, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 418.2098388671875, "epoch": 0.052572623403778655, "grad_norm": 9.104899406433105, "kl": 2.357421875, "learning_rate": 1.0507462686567165e-05, "loss": 0.0394, "reward": 0.685825914144516, "reward_std": 0.23972102627158165, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6277901828289032, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 438.3594055175781, "epoch": 0.052871331491300126, "grad_norm": 4.820700645446777, "kl": 0.718505859375, "learning_rate": 1.0567164179104479e-05, "loss": 0.0122, "reward": 0.7929687798023224, "reward_std": 0.20634136348962784, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6702009290456772, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 404.12278747558594, "epoch": 0.0531700395788216, "grad_norm": 6.183660507202148, "kl": 1.603515625, "learning_rate": 1.0626865671641793e-05, "loss": 0.0197, "reward": 0.7600446790456772, "reward_std": 0.19636428728699684, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6685268133878708, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 394.4799270629883, "epoch": 0.05346874766634307, "grad_norm": 8.238167762756348, "kl": 0.5869140625, "learning_rate": 1.0686567164179105e-05, "loss": 0.0814, "reward": 0.7907366454601288, "reward_std": 0.17860116064548492, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6992187798023224, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 406.69422149658203, "epoch": 0.05376745575386454, "grad_norm": 12.241090774536133, "kl": 1.65625, "learning_rate": 1.074626865671642e-05, "loss": 0.0287, "reward": 0.7349330633878708, "reward_std": 0.17263685166835785, "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6858259290456772, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 383.90626525878906, "epoch": 0.054066163841386, "grad_norm": 16.162078857421875, "kl": 2.28125, "learning_rate": 1.0805970149253733e-05, "loss": 0.15, "reward": 0.7332589626312256, "reward_std": 0.20817912742495537, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.6774553805589676, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 384.0959930419922, "epoch": 0.054364871928907474, "grad_norm": 49.17191696166992, "kl": 3.2109375, "learning_rate": 1.0865671641791044e-05, "loss": 0.3096, "reward": 0.8085937798023224, "reward_std": 0.2004268877208233, "rewards/accuracy_reward": 0.12276786123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6858259290456772, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 405.17413330078125, "epoch": 0.054663580016428945, "grad_norm": 42.98146438598633, "kl": 4.00390625, "learning_rate": 1.0925373134328358e-05, "loss": 0.2146, "reward": 0.6981027126312256, "reward_std": 0.17543894797563553, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6869420111179352, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 408.05804443359375, "epoch": 0.054962288103950416, "grad_norm": 22.397188186645508, "kl": 2.37109375, "learning_rate": 1.0985074626865672e-05, "loss": 0.0178, "reward": 0.772879496216774, "reward_std": 0.1552120391279459, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6969866305589676, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 385.2210006713867, "epoch": 0.05526099619147189, "grad_norm": 21.956212997436523, "kl": 2.228515625, "learning_rate": 1.1044776119402986e-05, "loss": 0.1133, "reward": 0.7371652126312256, "reward_std": 0.17240698635578156, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6925223469734192, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 407.43528747558594, "epoch": 0.05555970427899335, "grad_norm": 16.977989196777344, "kl": 1.37109375, "learning_rate": 1.1104477611940298e-05, "loss": 0.0394, "reward": 0.8046875447034836, "reward_std": 0.19122829288244247, "rewards/accuracy_reward": 0.10044643236324191, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7042411118745804, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 460.31028747558594, "epoch": 0.05585841236651482, "grad_norm": 15.689048767089844, "kl": 1.525390625, "learning_rate": 1.1164179104477612e-05, "loss": 0.067, "reward": 0.7879464775323868, "reward_std": 0.13776052743196487, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.707589328289032, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 459.2567138671875, "epoch": 0.05615712045403629, "grad_norm": 26.968746185302734, "kl": 1.947265625, "learning_rate": 1.1223880597014926e-05, "loss": 0.0978, "reward": 0.7901785969734192, "reward_std": 0.15505405701696873, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.705357164144516, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 429.7076110839844, "epoch": 0.056455828541557763, "grad_norm": 3.0215508937835693, "kl": 1.0, "learning_rate": 1.128358208955224e-05, "loss": -0.0445, "reward": 0.8085937798023224, "reward_std": 0.09623875468969345, "rewards/accuracy_reward": 0.08035714528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7282366454601288, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 439.5424346923828, "epoch": 0.056754536629079234, "grad_norm": 25.676021575927734, "kl": 2.220703125, "learning_rate": 1.1343283582089553e-05, "loss": 0.0608, "reward": 0.7606027126312256, "reward_std": 0.1290527991950512, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7159598618745804, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 440.6808166503906, "epoch": 0.057053244716600705, "grad_norm": 28.6173095703125, "kl": 2.1845703125, "learning_rate": 1.1402985074626867e-05, "loss": 0.0686, "reward": 0.7349330633878708, "reward_std": 0.135705491527915, "rewards/accuracy_reward": 0.017857143422588706, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7170759290456772, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 483.4844055175781, "epoch": 0.05735195280412217, "grad_norm": 7.870949745178223, "kl": 0.73046875, "learning_rate": 1.1462686567164181e-05, "loss": 0.0552, "reward": 0.8214286118745804, "reward_std": 0.1312213260680437, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7276786118745804, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 411.5535888671875, "epoch": 0.05765066089164364, "grad_norm": 14.988609313964844, "kl": 1.5576171875, "learning_rate": 1.1522388059701493e-05, "loss": 0.0119, "reward": 0.8214285969734192, "reward_std": 0.11604922823607922, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7321428805589676, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 480.34153747558594, "epoch": 0.05794936897916511, "grad_norm": 14.300446510314941, "kl": 2.9853515625, "learning_rate": 1.1582089552238807e-05, "loss": -0.0035, "reward": 0.776785746216774, "reward_std": 0.12889769300818443, "rewards/accuracy_reward": 0.05580357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720982164144516, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 481.9799270629883, "epoch": 0.05824807706668658, "grad_norm": 20.02129364013672, "kl": 1.0458984375, "learning_rate": 1.1641791044776121e-05, "loss": 0.0742, "reward": 0.7472098469734192, "reward_std": 0.1360456757247448, "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.727120578289032, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 468.16967010498047, "epoch": 0.05854678515420805, "grad_norm": 12.612679481506348, "kl": 1.4716796875, "learning_rate": 1.1701492537313435e-05, "loss": 0.0751, "reward": 0.7957589626312256, "reward_std": 0.12527215853333473, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7176339626312256, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 491.8482437133789, "epoch": 0.05884549324172952, "grad_norm": 7.476356506347656, "kl": 1.470703125, "learning_rate": 1.1761194029850746e-05, "loss": -0.0055, "reward": 0.7444196939468384, "reward_std": 0.13655710965394974, "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7220982611179352, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 527.334846496582, "epoch": 0.05914420132925099, "grad_norm": 83.48664855957031, "kl": 4.490234375, "learning_rate": 1.182089552238806e-05, "loss": 0.3062, "reward": 0.7840402126312256, "reward_std": 0.12132020108401775, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.726004496216774, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 463.00225830078125, "epoch": 0.05944290941677246, "grad_norm": 14.865174293518066, "kl": 1.05078125, "learning_rate": 1.1880597014925374e-05, "loss": 0.0596, "reward": 0.770089328289032, "reward_std": 0.13829203136265278, "rewards/accuracy_reward": 0.0401785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.729910746216774, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 468.4576110839844, "epoch": 0.05974161750429393, "grad_norm": 62.87678527832031, "kl": 2.71484375, "learning_rate": 1.1940298507462686e-05, "loss": 0.1985, "reward": 0.7896205633878708, "reward_std": 0.14301526732742786, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7248884290456772, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 486.06029510498047, "epoch": 0.0600403255918154, "grad_norm": 8.948474884033203, "kl": 1.1298828125, "learning_rate": 1.2e-05, "loss": 0.0676, "reward": 0.7929687947034836, "reward_std": 0.1129743019118905, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7282366454601288, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 513.5290451049805, "epoch": 0.060339033679336865, "grad_norm": 7.83298921585083, "kl": 1.6298828125, "learning_rate": 1.2059701492537314e-05, "loss": 0.0235, "reward": 0.809151828289032, "reward_std": 0.153180293738842, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.715401828289032, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 510.15404510498047, "epoch": 0.060637741766858336, "grad_norm": 12.203262329101562, "kl": 2.6728515625, "learning_rate": 1.2119402985074628e-05, "loss": 0.0576, "reward": 0.7561384290456772, "reward_std": 0.17380356043577194, "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7137277126312256, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 507.80137634277344, "epoch": 0.06093644985437981, "grad_norm": 19.412694931030273, "kl": 0.93115234375, "learning_rate": 1.217910447761194e-05, "loss": 0.104, "reward": 0.813058078289032, "reward_std": 0.1318833101540804, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7170759290456772, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 501.4843978881836, "epoch": 0.06123515794190128, "grad_norm": 10.158946990966797, "kl": 0.75537109375, "learning_rate": 1.2238805970149255e-05, "loss": 0.0339, "reward": 0.7940848469734192, "reward_std": 0.1467900201678276, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7248884290456772, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 505.91966247558594, "epoch": 0.06153386602942275, "grad_norm": 15.84266185760498, "kl": 0.9404296875, "learning_rate": 1.2298507462686569e-05, "loss": 0.1109, "reward": 0.7472098618745804, "reward_std": 0.12871465273201466, "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7204241454601288, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 489.5067138671875, "epoch": 0.06183257411694422, "grad_norm": 24.107410430908203, "kl": 0.87646484375, "learning_rate": 1.2358208955223883e-05, "loss": 0.0592, "reward": 0.8571428805589676, "reward_std": 0.15421994775533676, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.729910746216774, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 526.1027069091797, "epoch": 0.06213128220446568, "grad_norm": 46.528526306152344, "kl": 2.1494140625, "learning_rate": 1.2417910447761195e-05, "loss": 0.1191, "reward": 0.7332589626312256, "reward_std": 0.15206742845475674, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.7131696790456772, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 446.1964416503906, "epoch": 0.062429990291987154, "grad_norm": 9.137903213500977, "kl": 1.134765625, "learning_rate": 1.2477611940298509e-05, "loss": -0.0023, "reward": 0.8007812947034836, "reward_std": 0.14373326860368252, "rewards/accuracy_reward": 0.07812500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7226562798023224, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 516.4777145385742, "epoch": 0.06272869837950862, "grad_norm": 2.907829523086548, "kl": 1.02734375, "learning_rate": 1.2537313432835823e-05, "loss": 0.0568, "reward": 0.7745536118745804, "reward_std": 0.15503721311688423, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.714285746216774, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 494.1830520629883, "epoch": 0.06302740646703009, "grad_norm": 6.369578838348389, "kl": 2.951171875, "learning_rate": 1.2597014925373134e-05, "loss": 0.0769, "reward": 0.83370541036129, "reward_std": 0.1491466723382473, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.7198661118745804, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 485.4553756713867, "epoch": 0.06332611455455156, "grad_norm": 3.0892059803009033, "kl": 0.6728515625, "learning_rate": 1.2656716417910448e-05, "loss": -0.0143, "reward": 0.828683078289032, "reward_std": 0.1957986131310463, "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.7193080633878708, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 478.9955520629883, "epoch": 0.06362482264207303, "grad_norm": 6.118807315826416, "kl": 0.3583984375, "learning_rate": 1.2716417910447762e-05, "loss": 0.0708, "reward": 0.797433078289032, "reward_std": 0.1371159553527832, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7237723469734192, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 502.25225830078125, "epoch": 0.0639235307295945, "grad_norm": 16.688175201416016, "kl": 0.95849609375, "learning_rate": 1.2776119402985076e-05, "loss": 0.137, "reward": 0.8577009290456772, "reward_std": 0.2026694379746914, "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.7170759290456772, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 498.12279510498047, "epoch": 0.06422223881711597, "grad_norm": 5.849637985229492, "kl": 0.7177734375, "learning_rate": 1.2835820895522388e-05, "loss": 0.0868, "reward": 0.7678571790456772, "reward_std": 0.09462153166532516, "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.7276785969734192, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 454.32814025878906, "epoch": 0.06452094690463744, "grad_norm": 12.799789428710938, "kl": 2.037109375, "learning_rate": 1.2895522388059702e-05, "loss": 0.091, "reward": 0.7594866454601288, "reward_std": 0.1505365278571844, "rewards/accuracy_reward": 0.03125000232830644, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.726004496216774, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 505.4888610839844, "epoch": 0.06481965499215891, "grad_norm": 3.6148412227630615, "kl": 0.5224609375, "learning_rate": 1.2955223880597016e-05, "loss": 0.0476, "reward": 0.8805803954601288, "reward_std": 0.15522352047264576, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.7265625298023224, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 476.419677734375, "epoch": 0.06511836307968039, "grad_norm": 16.161510467529297, "kl": 1.33984375, "learning_rate": 1.3014925373134329e-05, "loss": 0.1755, "reward": 0.8046875298023224, "reward_std": 0.2599400207400322, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.006696428870782256, "rewards/tag_count_reward": 0.7198660969734192, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 540.0669937133789, "epoch": 0.06541707116720186, "grad_norm": 3.7163259983062744, "kl": 0.831298828125, "learning_rate": 1.3074626865671643e-05, "loss": 0.0892, "reward": 0.7656250447034836, "reward_std": 0.16333656385540962, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.011160714784637094, "rewards/tag_count_reward": 0.7120536118745804, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 519.3281402587891, "epoch": 0.06571577925472333, "grad_norm": 6.375974655151367, "kl": 0.658203125, "learning_rate": 1.3134328358208957e-05, "loss": 0.0401, "reward": 0.809151828289032, "reward_std": 0.15928427502512932, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.006696428870782256, "rewards/tag_count_reward": 0.7265625298023224, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 497.72547149658203, "epoch": 0.06601448734224478, "grad_norm": 15.193469047546387, "kl": 0.92822265625, "learning_rate": 1.319402985074627e-05, "loss": 0.0902, "reward": 0.7873884439468384, "reward_std": 0.17838457226753235, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.718191996216774, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 493.6919937133789, "epoch": 0.06631319542976626, "grad_norm": 29.90961456298828, "kl": 0.336669921875, "learning_rate": 1.3253731343283583e-05, "loss": 0.1101, "reward": 0.7773437798023224, "reward_std": 0.14213409833610058, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.7327009439468384, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 459.14734649658203, "epoch": 0.06661190351728773, "grad_norm": 20.184417724609375, "kl": 0.81982421875, "learning_rate": 1.3313432835820897e-05, "loss": 0.1498, "reward": 0.8264509439468384, "reward_std": 0.1742965541779995, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.7282366454601288, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 461.33484649658203, "epoch": 0.0669106116048092, "grad_norm": 57.7462272644043, "kl": 1.6474609375, "learning_rate": 1.3373134328358211e-05, "loss": 0.202, "reward": 0.8649553954601288, "reward_std": 0.21641451120376587, "rewards/accuracy_reward": 0.12500000419095159, "rewards/format_reward": 0.011160714784637094, "rewards/tag_count_reward": 0.7287946790456772, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 458.02457427978516, "epoch": 0.06720931969233067, "grad_norm": 153.24508666992188, "kl": 4.109375, "learning_rate": 1.3432835820895525e-05, "loss": 0.3834, "reward": 0.7879464626312256, "reward_std": 0.24546049162745476, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.006696428870782256, "rewards/tag_count_reward": 0.705357164144516, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 486.8214416503906, "epoch": 0.06750802777985214, "grad_norm": 230.93350219726562, "kl": 6.64453125, "learning_rate": 1.3492537313432836e-05, "loss": 0.6304, "reward": 0.7209821790456772, "reward_std": 0.2257479541003704, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.006696428870782256, "rewards/tag_count_reward": 0.674107164144516, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 468.6986846923828, "epoch": 0.06780673586737361, "grad_norm": 88.42382049560547, "kl": 4.5546875, "learning_rate": 1.355223880597015e-05, "loss": 0.4531, "reward": 0.7779018133878708, "reward_std": 0.29119322821497917, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.008928572060540318, "rewards/tag_count_reward": 0.6930803954601288, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 454.1406555175781, "epoch": 0.06810544395489508, "grad_norm": 6.749752521514893, "kl": 1.478515625, "learning_rate": 1.3611940298507464e-05, "loss": 0.1679, "reward": 0.7700892984867096, "reward_std": 0.21358053386211395, "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.008928571827709675, "rewards/tag_count_reward": 0.7366071790456772, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 478.28126525878906, "epoch": 0.06840415204241655, "grad_norm": 3.1345064640045166, "kl": 1.3037109375, "learning_rate": 1.3671641791044776e-05, "loss": 0.0728, "reward": 0.7734375298023224, "reward_std": 0.16471873968839645, "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.006696428870782256, "rewards/tag_count_reward": 0.7399553954601288, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 520.4174346923828, "epoch": 0.06870286012993802, "grad_norm": 5.174854755401611, "kl": 0.75244140625, "learning_rate": 1.373134328358209e-05, "loss": 0.0667, "reward": 0.8258928954601288, "reward_std": 0.19952703639864922, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750447034836, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 488.7455596923828, "epoch": 0.0690015682174595, "grad_norm": 2.4805526733398438, "kl": 0.4931640625, "learning_rate": 1.3791044776119404e-05, "loss": 0.0367, "reward": 0.8476562798023224, "reward_std": 0.191145870834589, "rewards/accuracy_reward": 0.10267857508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7449777126312256, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 493.04467010498047, "epoch": 0.06930027630498095, "grad_norm": 1.9530426263809204, "kl": 0.37939453125, "learning_rate": 1.3850746268656718e-05, "loss": -0.011, "reward": 0.832589328289032, "reward_std": 0.14235105365514755, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7455357313156128, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 474.9598388671875, "epoch": 0.06959898439250242, "grad_norm": 0.7341502904891968, "kl": 0.304931640625, "learning_rate": 1.391044776119403e-05, "loss": 0.0394, "reward": 0.8906250447034836, "reward_std": 0.12886167038232088, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7566964477300644, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 472.09375762939453, "epoch": 0.06989769248002389, "grad_norm": 2.147273302078247, "kl": 0.9658203125, "learning_rate": 1.3970149253731344e-05, "loss": 0.0596, "reward": 0.8856027275323868, "reward_std": 0.18216846883296967, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7873884439468384, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 520.6183319091797, "epoch": 0.07019640056754536, "grad_norm": 21.264009475708008, "kl": 2.72900390625, "learning_rate": 1.4029850746268658e-05, "loss": 0.2315, "reward": 0.9425223767757416, "reward_std": 0.15863293409347534, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8666295260190964, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 430.4643020629883, "epoch": 0.07049510865506683, "grad_norm": 10.449820518493652, "kl": 2.263916015625, "learning_rate": 1.408955223880597e-05, "loss": 0.1005, "reward": 1.1032366752624512, "reward_std": 0.18828385695815086, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.9358259439468384, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 395.85491943359375, "epoch": 0.0707938167425883, "grad_norm": 0.6474987864494324, "kl": 0.6591796875, "learning_rate": 1.4149253731343285e-05, "loss": -0.0367, "reward": 1.0279018431901932, "reward_std": 0.15339142456650734, "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965401828289032, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 483.5870666503906, "epoch": 0.07109252483010978, "grad_norm": 1.2652392387390137, "kl": 0.28466796875, "learning_rate": 1.4208955223880599e-05, "loss": 0.0035, "reward": 1.0898438096046448, "reward_std": 0.12878621648997068, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 577.4844055175781, "epoch": 0.07139123291763125, "grad_norm": 1.7291364669799805, "kl": 4.41259765625, "learning_rate": 1.4268656716417913e-05, "loss": -0.0317, "reward": 0.8816964626312256, "reward_std": 0.1994314193725586, "rewards/accuracy_reward": 0.10267857229337096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7790178954601288, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 531.1473541259766, "epoch": 0.07168994100515272, "grad_norm": 0.5824922323226929, "kl": 0.26953125, "learning_rate": 1.4328358208955224e-05, "loss": -0.0477, "reward": 0.7633928954601288, "reward_std": 0.14217619970440865, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500298023224, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 531.0982437133789, "epoch": 0.07198864909267419, "grad_norm": 1.525701880455017, "kl": 0.32861328125, "learning_rate": 1.4388059701492538e-05, "loss": -0.064, "reward": 0.8080357611179352, "reward_std": 0.15583969466388226, "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7165178805589676, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 483.8705596923828, "epoch": 0.07228735718019566, "grad_norm": 0.731819748878479, "kl": 0.31689453125, "learning_rate": 1.4447761194029852e-05, "loss": -0.0435, "reward": 0.7628348618745804, "reward_std": 0.2970057800412178, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.022321429569274187, "rewards/tag_count_reward": 0.646763414144516, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 351.93750762939453, "epoch": 0.07258606526771712, "grad_norm": 3.810457706451416, "kl": 1.53125, "learning_rate": 1.4507462686567166e-05, "loss": -0.0764, "reward": 0.602120578289032, "reward_std": 0.3570767939090729, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0357142873108387, "rewards/tag_count_reward": 0.521763414144516, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 335.51564025878906, "epoch": 0.07288477335523859, "grad_norm": 119.48861694335938, "kl": 6.3759765625, "learning_rate": 1.4567164179104478e-05, "loss": 0.2415, "reward": 0.712053582072258, "reward_std": 0.4064289703965187, "rewards/accuracy_reward": 0.14508929220028222, "rewards/format_reward": 0.0625000037252903, "rewards/tag_count_reward": 0.5044643059372902, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 296.8973274230957, "epoch": 0.07318348144276006, "grad_norm": 2.24727463722229, "kl": 1.0439453125, "learning_rate": 1.4626865671641792e-05, "loss": -0.056, "reward": 0.5518973395228386, "reward_std": 0.4314607158303261, "rewards/accuracy_reward": 0.020089285913854837, "rewards/format_reward": 0.0558035746216774, "rewards/tag_count_reward": 0.4760044813156128, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 405.62278747558594, "epoch": 0.07348218953028153, "grad_norm": 1.620252013206482, "kl": 0.96728515625, "learning_rate": 1.4686567164179106e-05, "loss": -0.0342, "reward": 0.965401828289032, "reward_std": 0.5722060948610306, "rewards/accuracy_reward": 0.16741072619333863, "rewards/format_reward": 0.1517857201397419, "rewards/tag_count_reward": 0.6462053805589676, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 464.4062728881836, "epoch": 0.073780897617803, "grad_norm": 12.730090141296387, "kl": 4.80859375, "learning_rate": 1.4746268656716418e-05, "loss": 0.0763, "reward": 1.064732164144516, "reward_std": 0.6384020149707794, "rewards/accuracy_reward": 0.10491072060540318, "rewards/format_reward": 0.2366071566939354, "rewards/tag_count_reward": 0.7232143133878708, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 483.9866256713867, "epoch": 0.07407960570532447, "grad_norm": 91.02135467529297, "kl": 7.79296875, "learning_rate": 1.4805970149253732e-05, "loss": 0.5277, "reward": 1.1484375596046448, "reward_std": 0.7344646155834198, "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.3437500149011612, "rewards/tag_count_reward": 0.7511161118745804, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 470.24778747558594, "epoch": 0.07437831379284594, "grad_norm": 44.906158447265625, "kl": 5.1943359375, "learning_rate": 1.4865671641791046e-05, "loss": 0.2877, "reward": 1.2578125596046448, "reward_std": 0.738121822476387, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.4174107313156128, "rewards/tag_count_reward": 0.746651828289032, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 504.11610412597656, "epoch": 0.07467702188036741, "grad_norm": 9.303950309753418, "kl": 10.578125, "learning_rate": 1.492537313432836e-05, "loss": 0.0772, "reward": 1.2243303954601288, "reward_std": 0.7864609807729721, "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.470982164144516, "rewards/tag_count_reward": 0.7198661118745804, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 534.9152069091797, "epoch": 0.07497572996788888, "grad_norm": 11.445030212402344, "kl": 2.943359375, "learning_rate": 1.4985074626865673e-05, "loss": 0.286, "reward": 1.2438616454601288, "reward_std": 0.7538609355688095, "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.5044643208384514, "rewards/tag_count_reward": 0.726004496216774, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 574.3683166503906, "epoch": 0.07527443805541036, "grad_norm": 21.95745086669922, "kl": 3.5087890625, "learning_rate": 1.5044776119402987e-05, "loss": 0.3617, "reward": 1.469866156578064, "reward_std": 0.7231039106845856, "rewards/accuracy_reward": 0.05803571850992739, "rewards/format_reward": 0.6160714626312256, "rewards/tag_count_reward": 0.7957589626312256, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 496.8794937133789, "epoch": 0.07557314614293181, "grad_norm": 6.889185905456543, "kl": 0.9482421875, "learning_rate": 1.51044776119403e-05, "loss": 0.1966, "reward": 1.6568081378936768, "reward_std": 0.6832538694143295, "rewards/accuracy_reward": 0.14732143143191934, "rewards/format_reward": 0.6852678805589676, "rewards/tag_count_reward": 0.8242187947034836, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 540.5558395385742, "epoch": 0.07587185423045328, "grad_norm": 4.581815242767334, "kl": 1.1728515625, "learning_rate": 1.5164179104477611e-05, "loss": 0.2023, "reward": 1.713727742433548, "reward_std": 0.6475653871893883, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.77901791036129, "rewards/tag_count_reward": 0.8766741305589676, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 525.8750228881836, "epoch": 0.07617056231797475, "grad_norm": 31.92071533203125, "kl": 6.1630859375, "learning_rate": 1.5223880597014925e-05, "loss": 0.4711, "reward": 1.7751116752624512, "reward_std": 0.5429321750998497, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.8147321939468384, "rewards/tag_count_reward": 0.8934152275323868, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 501.1585006713867, "epoch": 0.07646927040549623, "grad_norm": 9.136240005493164, "kl": 2.2060546875, "learning_rate": 1.528358208955224e-05, "loss": 0.2656, "reward": 1.8844866752624512, "reward_std": 0.42135708034038544, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.8861607611179352, "rewards/tag_count_reward": 0.9358259290456772, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 528.9866409301758, "epoch": 0.0767679784930177, "grad_norm": 6.086483955383301, "kl": 1.92236328125, "learning_rate": 1.5343283582089555e-05, "loss": 0.2513, "reward": 1.8638393580913544, "reward_std": 0.5190890952944756, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.8816964626312256, "rewards/tag_count_reward": 0.9218750447034836, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 474.8013610839844, "epoch": 0.07706668658053917, "grad_norm": 1813.65966796875, "kl": 143.59375, "learning_rate": 1.5402985074626866e-05, "loss": 9.4272, "reward": 1.9866072237491608, "reward_std": 0.34973709285259247, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.8906250447034836, "rewards/tag_count_reward": 0.9397321790456772, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 461.46653747558594, "epoch": 0.07736539466806064, "grad_norm": 59.01897430419922, "kl": 5.46875, "learning_rate": 1.546268656716418e-05, "loss": 0.6333, "reward": 1.91350457072258, "reward_std": 0.3741312511265278, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9196428954601288, "rewards/tag_count_reward": 0.9559152275323868, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 528.912971496582, "epoch": 0.07766410275558211, "grad_norm": 3.531203508377075, "kl": 0.38720703125, "learning_rate": 1.5522388059701494e-05, "loss": 0.0958, "reward": 1.8883929550647736, "reward_std": 0.43083757907152176, "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9464286118745804, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 495.6361846923828, "epoch": 0.07796281084310358, "grad_norm": 3.42118763923645, "kl": 0.35693359375, "learning_rate": 1.5582089552238808e-05, "loss": 0.1157, "reward": 1.9068081080913544, "reward_std": 0.41702190041542053, "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.9174107611179352, "rewards/tag_count_reward": 0.9492187798023224, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 477.6897506713867, "epoch": 0.07826151893062505, "grad_norm": 990.6107177734375, "kl": 16.8134765625, "learning_rate": 1.564179104477612e-05, "loss": 2.1072, "reward": 1.920758992433548, "reward_std": 0.47695595026016235, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.8839286118745804, "rewards/tag_count_reward": 0.9341518133878708, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 504.9107360839844, "epoch": 0.07856022701814652, "grad_norm": 212.58758544921875, "kl": 7.212890625, "learning_rate": 1.5701492537313433e-05, "loss": 0.5568, "reward": 1.8995536863803864, "reward_std": 0.4772149845957756, "rewards/accuracy_reward": 0.08482143003493547, "rewards/format_reward": 0.8906250447034836, "rewards/tag_count_reward": 0.9241071790456772, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 533.6004791259766, "epoch": 0.07885893510566798, "grad_norm": 8.52969741821289, "kl": 3.08203125, "learning_rate": 1.5761194029850747e-05, "loss": 0.4365, "reward": 1.8833706378936768, "reward_std": 0.5824756249785423, "rewards/accuracy_reward": 0.10714286123402417, "rewards/format_reward": 0.8660714626312256, "rewards/tag_count_reward": 0.9101562947034836, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 493.60047149658203, "epoch": 0.07915764319318945, "grad_norm": 3.878610849380493, "kl": 0.68701171875, "learning_rate": 1.582089552238806e-05, "loss": 0.2005, "reward": 1.8928572237491608, "reward_std": 0.3876211829483509, "rewards/accuracy_reward": 0.024553573224693537, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.9531250447034836, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 488.0513610839844, "epoch": 0.07945635128071092, "grad_norm": 89.87899780273438, "kl": 6.81640625, "learning_rate": 1.5880597014925375e-05, "loss": 0.7769, "reward": 1.9107143878936768, "reward_std": 0.31770212948322296, "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.933035746216774, "rewards/tag_count_reward": 0.9642857760190964, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 493.7232360839844, "epoch": 0.07975505936823239, "grad_norm": 4.644393444061279, "kl": 0.93603515625, "learning_rate": 1.594029850746269e-05, "loss": 0.1953, "reward": 1.9436384737491608, "reward_std": 0.3826867491006851, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.9129464775323868, "rewards/tag_count_reward": 0.9592634290456772, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 499.6116256713867, "epoch": 0.08005376745575386, "grad_norm": 2.721386194229126, "kl": 0.46484375, "learning_rate": 1.6000000000000003e-05, "loss": 0.1038, "reward": 1.9419643580913544, "reward_std": 0.30240025371313095, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.9308036118745804, "rewards/tag_count_reward": 0.9642857611179352, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 486.84153747558594, "epoch": 0.08035247554327533, "grad_norm": 12.670315742492676, "kl": 1.7138671875, "learning_rate": 1.6059701492537313e-05, "loss": 0.2237, "reward": 1.9720983505249023, "reward_std": 0.3817445933818817, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.9017857611179352, "rewards/tag_count_reward": 0.949776828289032, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 516.7902069091797, "epoch": 0.0806511836307968, "grad_norm": 3.5285496711730957, "kl": 0.60205078125, "learning_rate": 1.6119402985074627e-05, "loss": 0.1045, "reward": 2.101562589406967, "reward_std": 0.3499620333313942, "rewards/accuracy_reward": 0.19196429569274187, "rewards/format_reward": 0.9330357611179352, "rewards/tag_count_reward": 0.9765625298023224, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 575.1339416503906, "epoch": 0.08094989171831828, "grad_norm": 0.8695207834243774, "kl": 0.2752685546875, "learning_rate": 1.617910447761194e-05, "loss": 0.1007, "reward": 1.9743304252624512, "reward_std": 0.3766213022172451, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9542410969734192, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 568.4107360839844, "epoch": 0.08124859980583975, "grad_norm": 1.0248770713806152, "kl": 0.19677734375, "learning_rate": 1.6238805970149255e-05, "loss": 0.06, "reward": 2.005022406578064, "reward_std": 0.37686654552817345, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9285714626312256, "rewards/tag_count_reward": 0.9670759290456772, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 542.1451263427734, "epoch": 0.08154730789336122, "grad_norm": 0.22461068630218506, "kl": 0.1669921875, "learning_rate": 1.629850746268657e-05, "loss": 0.0412, "reward": 2.0027903020381927, "reward_std": 0.1967563428916037, "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9849330633878708, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 552.2723388671875, "epoch": 0.08184601598088269, "grad_norm": 1.061505675315857, "kl": 0.303466796875, "learning_rate": 1.6358208955223883e-05, "loss": 0.1324, "reward": 2.1004464626312256, "reward_std": 0.3149108588695526, "rewards/accuracy_reward": 0.20758929289877415, "rewards/format_reward": 0.933035746216774, "rewards/tag_count_reward": 0.9598214775323868, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 538.2924346923828, "epoch": 0.08214472406840415, "grad_norm": 0.33907872438430786, "kl": 0.1239013671875, "learning_rate": 1.6417910447761197e-05, "loss": 0.0583, "reward": 1.94866082072258, "reward_std": 0.3252243846654892, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.9330357760190964, "rewards/tag_count_reward": 0.96651791036129, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 551.7433319091797, "epoch": 0.08244343215592562, "grad_norm": 0.36464568972587585, "kl": 0.081787109375, "learning_rate": 1.6477611940298508e-05, "loss": 0.1025, "reward": 1.9609375596046448, "reward_std": 0.3322100192308426, "rewards/accuracy_reward": 0.06250000442378223, "rewards/format_reward": 0.9330357611179352, "rewards/tag_count_reward": 0.965401828289032, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 638.1897583007812, "epoch": 0.08274214024344709, "grad_norm": 0.5089578628540039, "kl": 0.1357421875, "learning_rate": 1.6537313432835822e-05, "loss": 0.1199, "reward": 1.9274554252624512, "reward_std": 0.4158443659543991, "rewards/accuracy_reward": 0.0803571492433548, "rewards/format_reward": 0.8973214775323868, "rewards/tag_count_reward": 0.9497768431901932, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 795.575927734375, "epoch": 0.08304084833096856, "grad_norm": 54.724571228027344, "kl": 5.43359375, "learning_rate": 1.6597014925373136e-05, "loss": 0.4892, "reward": 1.1021205633878708, "reward_std": 0.8348542749881744, "rewards/accuracy_reward": 0.05803571664728224, "rewards/format_reward": 0.4464285895228386, "rewards/tag_count_reward": 0.5976562798023224, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 586.9464721679688, "epoch": 0.08333955641849003, "grad_norm": 3.727760076522827, "kl": 0.4293212890625, "learning_rate": 1.665671641791045e-05, "loss": 0.2158, "reward": 1.8108259439468384, "reward_std": 0.5437498986721039, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.832589328289032, "rewards/tag_count_reward": 0.88448666036129, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 612.1897583007812, "epoch": 0.0836382645060115, "grad_norm": 3.2088558673858643, "kl": 0.5687255859375, "learning_rate": 1.671641791044776e-05, "loss": 0.2197, "reward": 1.7873885035514832, "reward_std": 0.5478045120835304, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.8392857611179352, "rewards/tag_count_reward": 0.8922991454601288, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 626.4955749511719, "epoch": 0.08393697259353297, "grad_norm": 1.1047301292419434, "kl": 0.2296142578125, "learning_rate": 1.6776119402985075e-05, "loss": 0.1774, "reward": 1.862165242433548, "reward_std": 0.5900007635354996, "rewards/accuracy_reward": 0.11607143701985478, "rewards/format_reward": 0.8370536118745804, "rewards/tag_count_reward": 0.9090402126312256, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 671.810302734375, "epoch": 0.08423568068105444, "grad_norm": 0.7702422142028809, "kl": 0.233154296875, "learning_rate": 1.683582089552239e-05, "loss": 0.2845, "reward": 1.580915242433548, "reward_std": 0.7319788187742233, "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.6540178954601288, "rewards/tag_count_reward": 0.788504496216774, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 729.2544860839844, "epoch": 0.08453438876857591, "grad_norm": 1.497448444366455, "kl": 0.267578125, "learning_rate": 1.6895522388059703e-05, "loss": 0.3554, "reward": 1.1662946939468384, "reward_std": 0.8138564825057983, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.475446455180645, "rewards/tag_count_reward": 0.6372767984867096, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 706.4687805175781, "epoch": 0.08483309685609738, "grad_norm": 0.6733664274215698, "kl": 0.221923828125, "learning_rate": 1.6955223880597017e-05, "loss": 0.3756, "reward": 1.2550223767757416, "reward_std": 0.7884461730718613, "rewards/accuracy_reward": 0.09598214854486287, "rewards/format_reward": 0.5111607313156128, "rewards/tag_count_reward": 0.6478794813156128, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 715.544677734375, "epoch": 0.08513180494361886, "grad_norm": 0.45224305987358093, "kl": 0.194091796875, "learning_rate": 1.701492537313433e-05, "loss": 0.3561, "reward": 1.2728795111179352, "reward_std": 0.8634754866361618, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.5357143059372902, "rewards/tag_count_reward": 0.6568080633878708, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 618.0535888671875, "epoch": 0.08543051303114031, "grad_norm": 1.0075457096099854, "kl": 0.17626953125, "learning_rate": 1.7074626865671645e-05, "loss": 0.3724, "reward": 1.5256696939468384, "reward_std": 0.728341281414032, "rewards/accuracy_reward": 0.12276786426082253, "rewards/format_reward": 0.6584821790456772, "rewards/tag_count_reward": 0.7444196939468384, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 629.075927734375, "epoch": 0.08572922111866178, "grad_norm": 10.663813591003418, "kl": 0.411376953125, "learning_rate": 1.7134328358208956e-05, "loss": 0.3981, "reward": 1.5524554252624512, "reward_std": 0.7930385321378708, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.6696428805589676, "rewards/tag_count_reward": 0.7667411118745804, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 628.4062652587891, "epoch": 0.08602792920618325, "grad_norm": 0.6657167077064514, "kl": 0.1715087890625, "learning_rate": 1.719402985074627e-05, "loss": 0.29, "reward": 1.4748884439468384, "reward_std": 0.6462281346321106, "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.683035746216774, "rewards/tag_count_reward": 0.7695312947034836, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 602.9576263427734, "epoch": 0.08632663729370472, "grad_norm": 0.32371172308921814, "kl": 0.1156005859375, "learning_rate": 1.7253731343283584e-05, "loss": 0.2908, "reward": 1.5284598767757416, "reward_std": 0.6716296523809433, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.683035746216774, "rewards/tag_count_reward": 0.7672991305589676, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 605.7901992797852, "epoch": 0.0866253453812262, "grad_norm": 0.7334277629852295, "kl": 0.230712890625, "learning_rate": 1.7313432835820894e-05, "loss": 0.3002, "reward": 1.5295759439468384, "reward_std": 0.6785011738538742, "rewards/accuracy_reward": 0.10267857392318547, "rewards/format_reward": 0.667410746216774, "rewards/tag_count_reward": 0.7594866305589676, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 511.5826110839844, "epoch": 0.08692405346874767, "grad_norm": 1.5831133127212524, "kl": 0.258056640625, "learning_rate": 1.7373134328358208e-05, "loss": 0.2826, "reward": 1.7779018580913544, "reward_std": 0.5816572830080986, "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.8214286118745804, "rewards/tag_count_reward": 0.8716518133878708, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 480.4397659301758, "epoch": 0.08722276155626914, "grad_norm": 79.90766906738281, "kl": 1.6121826171875, "learning_rate": 1.7432835820895522e-05, "loss": 0.2488, "reward": 1.8169643580913544, "reward_std": 0.43095812946558, "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.8772321939468384, "rewards/tag_count_reward": 0.9241071939468384, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 490.1652069091797, "epoch": 0.08752146964379061, "grad_norm": 0.444474458694458, "kl": 0.140625, "learning_rate": 1.7492537313432836e-05, "loss": 0.3483, "reward": 1.8889509737491608, "reward_std": 0.5863768309354782, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.8482143431901932, "rewards/tag_count_reward": 0.8867188096046448, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 572.8415374755859, "epoch": 0.08782017773131208, "grad_norm": 0.499798983335495, "kl": 0.192626953125, "learning_rate": 1.755223880597015e-05, "loss": 0.463, "reward": 1.3744420409202576, "reward_std": 0.7605770826339722, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.6316964477300644, "rewards/tag_count_reward": 0.7382812798023224, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 855.3750457763672, "epoch": 0.08811888581883355, "grad_norm": 2.1087894439697266, "kl": 0.344970703125, "learning_rate": 1.7611940298507464e-05, "loss": 0.3087, "reward": 0.6395089626312256, "reward_std": 0.615754172205925, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.2008928656578064, "rewards/tag_count_reward": 0.4029018059372902, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 883.4509429931641, "epoch": 0.08841759390635502, "grad_norm": 1.190443754196167, "kl": 0.19775390625, "learning_rate": 1.767164179104478e-05, "loss": 0.284, "reward": 0.5318080484867096, "reward_std": 0.5224883034825325, "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.15178571827709675, "rewards/tag_count_reward": 0.3666294813156128, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 424.2946548461914, "epoch": 0.08871630199387648, "grad_norm": 3.7044429779052734, "kl": 0.252685546875, "learning_rate": 1.7731343283582092e-05, "loss": 0.6419, "reward": 1.557477742433548, "reward_std": 0.7066863924264908, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.7098214626312256, "rewards/tag_count_reward": 0.7695312798023224, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 314.4442138671875, "epoch": 0.08901501008139795, "grad_norm": 17.3271427154541, "kl": 1.191650390625, "learning_rate": 1.7791044776119403e-05, "loss": 0.701, "reward": 1.6908482909202576, "reward_std": 0.5716342777013779, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8236607611179352, "rewards/tag_count_reward": 0.8671875298023224, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 257.1473388671875, "epoch": 0.08931371816891942, "grad_norm": 8.834965705871582, "kl": 0.329833984375, "learning_rate": 1.7850746268656717e-05, "loss": 0.4112, "reward": 1.8381697237491608, "reward_std": 0.43621183186769485, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.886160746216774, "rewards/tag_count_reward": 0.9140625298023224, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 251.26117324829102, "epoch": 0.08961242625644089, "grad_norm": 652.1995239257812, "kl": 19.2177734375, "learning_rate": 1.791044776119403e-05, "loss": 2.5764, "reward": 1.8861607909202576, "reward_std": 0.5205890312790871, "rewards/accuracy_reward": 0.1607142984867096, "rewards/format_reward": 0.8683036118745804, "rewards/tag_count_reward": 0.8571428954601288, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 293.34599685668945, "epoch": 0.08991113434396236, "grad_norm": 2.261232614517212, "kl": 0.2607421875, "learning_rate": 1.7970149253731345e-05, "loss": 0.3646, "reward": 1.5887277722358704, "reward_std": 0.4844237193465233, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.895089328289032, "rewards/tag_count_reward": 0.693638414144516, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 239.4084930419922, "epoch": 0.09020984243148383, "grad_norm": 1.8284614086151123, "kl": 0.3388671875, "learning_rate": 1.802985074626866e-05, "loss": 0.2357, "reward": 1.7215402722358704, "reward_std": 0.46947529911994934, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.8839286118745804, "rewards/tag_count_reward": 0.7996651977300644, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 204.29018783569336, "epoch": 0.0905085505190053, "grad_norm": 3.387589454650879, "kl": 0.42529296875, "learning_rate": 1.8089552238805973e-05, "loss": 0.2419, "reward": 1.7583706378936768, "reward_std": 0.46434441208839417, "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.8348214775323868, "rewards/tag_count_reward": 0.914620578289032, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 182.03572463989258, "epoch": 0.09080725860652678, "grad_norm": 2.6185925006866455, "kl": 0.292724609375, "learning_rate": 1.8149253731343287e-05, "loss": 0.17, "reward": 1.741071492433548, "reward_std": 0.5314472764730453, "rewards/accuracy_reward": 0.0267857164144516, "rewards/format_reward": 0.7946428954601288, "rewards/tag_count_reward": 0.9196428954601288, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 173.52679443359375, "epoch": 0.09110596669404825, "grad_norm": 5.33125638961792, "kl": 0.32275390625, "learning_rate": 1.8208955223880598e-05, "loss": 0.2729, "reward": 1.8744420409202576, "reward_std": 0.3848069757223129, "rewards/accuracy_reward": 0.03571428847499192, "rewards/format_reward": 0.886160746216774, "rewards/tag_count_reward": 0.9525670111179352, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 183.15848541259766, "epoch": 0.09140467478156972, "grad_norm": 9.007524490356445, "kl": 0.34326171875, "learning_rate": 1.8268656716417912e-05, "loss": 0.2635, "reward": 1.8934152722358704, "reward_std": 0.45998039096593857, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.8816964775323868, "rewards/tag_count_reward": 0.9402902275323868, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 143.665189743042, "epoch": 0.09170338286909117, "grad_norm": 11.798418998718262, "kl": 0.419921875, "learning_rate": 1.8328358208955226e-05, "loss": 0.1047, "reward": 1.9068081080913544, "reward_std": 0.48982833325862885, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.8616071939468384, "rewards/tag_count_reward": 0.9268973767757416, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 152.38616561889648, "epoch": 0.09200209095661264, "grad_norm": 23.4014949798584, "kl": 0.5361328125, "learning_rate": 1.8388059701492537e-05, "loss": 0.1068, "reward": 1.8080357611179352, "reward_std": 0.5330165401101112, "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.8616071790456772, "rewards/tag_count_reward": 0.9174107611179352, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 134.25670051574707, "epoch": 0.09230079904413412, "grad_norm": 41.26806640625, "kl": 0.86083984375, "learning_rate": 1.844776119402985e-05, "loss": 0.1647, "reward": 1.8147322237491608, "reward_std": 0.46794815361499786, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8861607611179352, "rewards/tag_count_reward": 0.9285714626312256, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 153.04241943359375, "epoch": 0.09259950713165559, "grad_norm": 2420.448974609375, "kl": 14.3828125, "learning_rate": 1.8507462686567165e-05, "loss": 3.0711, "reward": 1.90178582072258, "reward_std": 0.5291697755455971, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.8593750298023224, "rewards/tag_count_reward": 0.9241071939468384, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 162.15179443359375, "epoch": 0.09289821521917706, "grad_norm": 1015.701904296875, "kl": 8.373046875, "learning_rate": 1.856716417910448e-05, "loss": 1.7268, "reward": 1.813616156578064, "reward_std": 0.5441195145249367, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.85714291036129, "rewards/tag_count_reward": 0.9140625447034836, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 163.82590103149414, "epoch": 0.09319692330669853, "grad_norm": 728.13037109375, "kl": 7.546875, "learning_rate": 1.8626865671641793e-05, "loss": 1.7744, "reward": 1.8883929550647736, "reward_std": 0.5273571014404297, "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.8727678954601288, "rewards/tag_count_reward": 0.9285714775323868, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 187.28795623779297, "epoch": 0.09349563139422, "grad_norm": 34.40441131591797, "kl": 3.7626953125, "learning_rate": 1.8686567164179107e-05, "loss": 0.5712, "reward": 1.7583706080913544, "reward_std": 0.5948084592819214, "rewards/accuracy_reward": 0.05580357299186289, "rewards/format_reward": 0.8147321790456772, "rewards/tag_count_reward": 0.8878348469734192, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 216.57366943359375, "epoch": 0.09379433948174147, "grad_norm": 4.999128818511963, "kl": 0.640625, "learning_rate": 1.874626865671642e-05, "loss": 0.5867, "reward": 1.734933078289032, "reward_std": 0.5646031945943832, "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.8102678805589676, "rewards/tag_count_reward": 0.8867187947034836, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 151.9285774230957, "epoch": 0.09409304756926294, "grad_norm": 2.326687812805176, "kl": 0.36376953125, "learning_rate": 1.8805970149253735e-05, "loss": 0.4366, "reward": 1.848772406578064, "reward_std": 0.41599492728710175, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.8995536118745804, "rewards/tag_count_reward": 0.94698666036129, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 168.33482933044434, "epoch": 0.09439175565678441, "grad_norm": 2.4444727897644043, "kl": 0.301513671875, "learning_rate": 1.8865671641791045e-05, "loss": 0.6075, "reward": 2.001674175262451, "reward_std": 0.2957671210169792, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9218750596046448, "rewards/tag_count_reward": 0.9726562947034836, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 129.15402603149414, "epoch": 0.09469046374430588, "grad_norm": 1.5838786363601685, "kl": 0.369140625, "learning_rate": 1.892537313432836e-05, "loss": 0.2134, "reward": 1.9994420409202576, "reward_std": 0.18555155023932457, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9838170111179352, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 197.1093864440918, "epoch": 0.09498917183182734, "grad_norm": 2.569911241531372, "kl": 0.389404296875, "learning_rate": 1.8985074626865673e-05, "loss": 0.5937, "reward": 1.85100457072258, "reward_std": 0.31579456478357315, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8928571790456772, "rewards/tag_count_reward": 0.9425223767757416, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 204.26340866088867, "epoch": 0.09528787991934881, "grad_norm": 2.8597230911254883, "kl": 0.9580078125, "learning_rate": 1.9044776119402984e-05, "loss": 0.7563, "reward": 1.7299107909202576, "reward_std": 0.5039025098085403, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.7968750447034836, "rewards/tag_count_reward": 0.8906250447034836, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 333.96207427978516, "epoch": 0.09558658800687028, "grad_norm": 141.09234619140625, "kl": 12.84375, "learning_rate": 1.9104477611940298e-05, "loss": 0.8204, "reward": 1.323102742433548, "reward_std": 0.7225542664527893, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.5401785969734192, "rewards/tag_count_reward": 0.780691996216774, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 582.4330596923828, "epoch": 0.09588529609439175, "grad_norm": 98.96479034423828, "kl": 8.6953125, "learning_rate": 1.9164179104477612e-05, "loss": 0.5266, "reward": 0.8683036118745804, "reward_std": 0.6286892592906952, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.2299107275903225, "rewards/tag_count_reward": 0.595982164144516, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 606.5268249511719, "epoch": 0.09618400418191322, "grad_norm": 244.1800079345703, "kl": 11.9453125, "learning_rate": 1.922388059701493e-05, "loss": 0.4585, "reward": 0.7957589775323868, "reward_std": 0.5671640038490295, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.1785714365541935, "rewards/tag_count_reward": 0.5814732164144516, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 730.544677734375, "epoch": 0.0964827122694347, "grad_norm": 57.72209930419922, "kl": 3.845703125, "learning_rate": 1.928358208955224e-05, "loss": 0.2943, "reward": 0.706473246216774, "reward_std": 0.4942045509815216, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0959821455180645, "rewards/tag_count_reward": 0.5212053805589676, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 735.247802734375, "epoch": 0.09678142035695617, "grad_norm": 4.8268256187438965, "kl": 1.2021484375, "learning_rate": 1.9343283582089554e-05, "loss": 0.2422, "reward": 0.5797991156578064, "reward_std": 0.4526684358716011, "rewards/accuracy_reward": 0.0066964291036129, "rewards/format_reward": 0.08035714738070965, "rewards/tag_count_reward": 0.4927455708384514, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 778.0513763427734, "epoch": 0.09708012844447764, "grad_norm": 1.9198248386383057, "kl": 0.62353515625, "learning_rate": 1.9402985074626868e-05, "loss": 0.2789, "reward": 0.7103795111179352, "reward_std": 0.5294370204210281, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.1227678619325161, "rewards/tag_count_reward": 0.5474330484867096, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 683.9442291259766, "epoch": 0.09737883653199911, "grad_norm": 2.5226125717163086, "kl": 0.5927734375, "learning_rate": 1.946268656716418e-05, "loss": 0.455, "reward": 0.9475446790456772, "reward_std": 0.6658075004816055, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.2477678693830967, "rewards/tag_count_reward": 0.6328125447034836, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 530.0089492797852, "epoch": 0.09767754461952058, "grad_norm": 16.32921600341797, "kl": 0.56787109375, "learning_rate": 1.9522388059701493e-05, "loss": 0.7598, "reward": 1.273995578289032, "reward_std": 0.7027301788330078, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.4375000223517418, "rewards/tag_count_reward": 0.7606027126312256, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 323.8727836608887, "epoch": 0.09797625270704205, "grad_norm": 222.9489288330078, "kl": 4.765625, "learning_rate": 1.9582089552238807e-05, "loss": 1.4323, "reward": 1.6272321939468384, "reward_std": 0.6396510154008865, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.674107164144516, "rewards/tag_count_reward": 0.8616071790456772, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 204.96206283569336, "epoch": 0.0982749607945635, "grad_norm": 64.76848602294922, "kl": 4.00390625, "learning_rate": 1.964179104477612e-05, "loss": 1.3391, "reward": 1.8325893580913544, "reward_std": 0.4944699928164482, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.8303571790456772, "rewards/tag_count_reward": 0.9285714626312256, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 136.52456092834473, "epoch": 0.09857366888208498, "grad_norm": 459.4096984863281, "kl": 13.015625, "learning_rate": 1.9701492537313435e-05, "loss": 1.888, "reward": 1.8543527722358704, "reward_std": 0.3907611668109894, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9017857611179352, "rewards/tag_count_reward": 0.9525670111179352, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 139.8504524230957, "epoch": 0.09887237696960645, "grad_norm": 946.3134155273438, "kl": 14.078125, "learning_rate": 1.976119402985075e-05, "loss": 2.4423, "reward": 1.9453125596046448, "reward_std": 0.3225971683859825, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.9196428954601288, "rewards/tag_count_reward": 0.9720982611179352, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 120.15178871154785, "epoch": 0.09917108505712792, "grad_norm": 339.1749267578125, "kl": 6.3671875, "learning_rate": 1.9820895522388063e-05, "loss": 1.204, "reward": 1.9681920409202576, "reward_std": 0.29555678740143776, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.9241071939468384, "rewards/tag_count_reward": 0.97042416036129, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 106.15848541259766, "epoch": 0.09946979314464939, "grad_norm": 504.15673828125, "kl": 4.59765625, "learning_rate": 1.9880597014925377e-05, "loss": 1.0259, "reward": 1.9335938394069672, "reward_std": 0.18949715420603752, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9531250298023224, "rewards/tag_count_reward": 0.9804687947034836, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 100.6004524230957, "epoch": 0.09976850123217086, "grad_norm": 129.24249267578125, "kl": 1.47265625, "learning_rate": 1.9940298507462688e-05, "loss": 0.6492, "reward": 1.9319197237491608, "reward_std": 0.3001772500574589, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9218750447034836, "rewards/tag_count_reward": 0.9743303954601288, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 110.17187881469727, "epoch": 0.10006720931969233, "grad_norm": 49.2911262512207, "kl": 0.9755859375, "learning_rate": 2e-05, "loss": 0.6104, "reward": 1.9771206676959991, "reward_std": 0.18325216323137283, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9575893431901932, "rewards/tag_count_reward": 0.9838170111179352, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 98.00447082519531, "epoch": 0.1003659174072138, "grad_norm": 85.24369049072266, "kl": 0.57568359375, "learning_rate": 1.9999994560490055e-05, "loss": 0.4376, "reward": 1.9419643580913544, "reward_std": 0.20692409574985504, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9821428954601288, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 106.32589530944824, "epoch": 0.10066462549473527, "grad_norm": 91.47799682617188, "kl": 0.654296875, "learning_rate": 1.999997824196613e-05, "loss": 0.695, "reward": 1.9687500894069672, "reward_std": 0.27744976431131363, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.9441964775323868, "rewards/tag_count_reward": 0.9799107611179352, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 96.30580711364746, "epoch": 0.10096333358225675, "grad_norm": 17.84906768798828, "kl": 0.662109375, "learning_rate": 1.999995104444598e-05, "loss": 0.3316, "reward": 2.0435268878936768, "reward_std": 0.20293622091412544, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.95089291036129, "rewards/tag_count_reward": 0.987723246216774, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 94.79018211364746, "epoch": 0.10126204166977822, "grad_norm": 61.29705810546875, "kl": 1.619140625, "learning_rate": 1.9999912967959197e-05, "loss": 0.362, "reward": 1.9765625894069672, "reward_std": 0.18696681037545204, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9877232611179352, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 95.77679061889648, "epoch": 0.10156074975729967, "grad_norm": 24.399761199951172, "kl": 0.9208984375, "learning_rate": 1.99998640125472e-05, "loss": 0.342, "reward": 1.9196429550647736, "reward_std": 0.20186872780323029, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9397321939468384, "rewards/tag_count_reward": 0.9799107611179352, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 85.14062881469727, "epoch": 0.10185945784482114, "grad_norm": 30.079076766967773, "kl": 2.0400390625, "learning_rate": 1.9999804178263253e-05, "loss": 0.344, "reward": 2.0474331378936768, "reward_std": 0.22288301587104797, "rewards/accuracy_reward": 0.10937500838190317, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9871652275323868, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 80.11384201049805, "epoch": 0.10215816593234262, "grad_norm": 57.7022705078125, "kl": 2.19921875, "learning_rate": 1.999973346517244e-05, "loss": 0.2178, "reward": 2.0318081378936768, "reward_std": 0.11428027413785458, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.99386166036129, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 84.90848731994629, "epoch": 0.10245687401986409, "grad_norm": 25.229990005493164, "kl": 1.423828125, "learning_rate": 1.99996518733517e-05, "loss": 0.3616, "reward": 2.065848261117935, "reward_std": 0.13918795343488455, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9899553805589676, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 85.66964721679688, "epoch": 0.10275558210738556, "grad_norm": 32.54103088378906, "kl": 1.38427734375, "learning_rate": 1.9999559402889794e-05, "loss": 0.2412, "reward": 2.0161831378936768, "reward_std": 0.10550626553595066, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9938616454601288, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 79.98884201049805, "epoch": 0.10305429019490703, "grad_norm": 19.474151611328125, "kl": 1.1337890625, "learning_rate": 1.9999456053887315e-05, "loss": 0.1865, "reward": 1.9799107909202576, "reward_std": 0.05830700974911451, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.995535746216774, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 85.25223541259766, "epoch": 0.1033529982824285, "grad_norm": 17.662050247192383, "kl": 1.27734375, "learning_rate": 1.9999341826456703e-05, "loss": 0.3661, "reward": 1.9899554252624512, "reward_std": 0.13949797488749027, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9899553805589676, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 89.11384201049805, "epoch": 0.10365170636994997, "grad_norm": 12.672257423400879, "kl": 0.765625, "learning_rate": 1.999921672072223e-05, "loss": 0.2178, "reward": 2.0491071939468384, "reward_std": 0.0879162922501564, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9955357313156128, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 88.40848541259766, "epoch": 0.10395041445747144, "grad_norm": 188.44488525390625, "kl": 2.802734375, "learning_rate": 1.9999080736819986e-05, "loss": 0.8431, "reward": 1.9960938096046448, "reward_std": 0.1443361733108759, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.991629496216774, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 95.58928680419922, "epoch": 0.10424912254499291, "grad_norm": 48.2551155090332, "kl": 1.7470703125, "learning_rate": 1.9998933874897922e-05, "loss": 0.5782, "reward": 1.9531250596046448, "reward_std": 0.1788277104496956, "rewards/accuracy_reward": 0.0066964291036129, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9866071790456772, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 105.2656307220459, "epoch": 0.10454783063251437, "grad_norm": 71.82964324951172, "kl": 1.087890625, "learning_rate": 1.99987761351158e-05, "loss": 0.4856, "reward": 2.024553656578064, "reward_std": 0.17829704843461514, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.9843750596046448, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 92.18973731994629, "epoch": 0.10484653872003584, "grad_norm": 61.039249420166016, "kl": 0.77734375, "learning_rate": 1.9998607517645227e-05, "loss": 0.4137, "reward": 2.0904018580913544, "reward_std": 0.15252108871936798, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9899553805589676, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 95.23661231994629, "epoch": 0.10514524680755731, "grad_norm": 14.11559009552002, "kl": 1.453125, "learning_rate": 1.9998428022669646e-05, "loss": 0.5124, "reward": 1.9681920409202576, "reward_std": 0.1980996299535036, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9815848618745804, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 116.28348731994629, "epoch": 0.10544395489507878, "grad_norm": 51.70475387573242, "kl": 2.75, "learning_rate": 1.9998237650384324e-05, "loss": 0.9807, "reward": 2.0133929550647736, "reward_std": 0.22647612914443016, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9821428954601288, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 123.81027412414551, "epoch": 0.10574266298260025, "grad_norm": 33.795143127441406, "kl": 2.201171875, "learning_rate": 1.9998036400996374e-05, "loss": 1.138, "reward": 2.023995667695999, "reward_std": 0.26217757537961006, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9397321939468384, "rewards/tag_count_reward": 0.977120578289032, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 118.21875381469727, "epoch": 0.10604137107012172, "grad_norm": 73.12481689453125, "kl": 3.31640625, "learning_rate": 1.9997824274724734e-05, "loss": 1.0943, "reward": 2.0022322237491608, "reward_std": 0.24115276709198952, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9776786118745804, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 149.7567024230957, "epoch": 0.1063400791576432, "grad_norm": 108.01139831542969, "kl": 5.296875, "learning_rate": 1.999760127180017e-05, "loss": 1.681, "reward": 1.9581474661827087, "reward_std": 0.35018768161535263, "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.9241071790456772, "rewards/tag_count_reward": 0.96261166036129, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 136.7075958251953, "epoch": 0.10663878724516467, "grad_norm": 89.17548370361328, "kl": 1.4326171875, "learning_rate": 1.99973673924653e-05, "loss": 0.7781, "reward": 1.9492188394069672, "reward_std": 0.3303825855255127, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.9218750298023224, "rewards/tag_count_reward": 0.9670759290456772, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 122.43304061889648, "epoch": 0.10693749533268614, "grad_norm": 50.26328659057617, "kl": 1.689453125, "learning_rate": 1.999712263697455e-05, "loss": 0.7454, "reward": 1.8856027722358704, "reward_std": 0.297866377979517, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.9704241454601288, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 124.75447082519531, "epoch": 0.10723620342020761, "grad_norm": 85.31748962402344, "kl": 4.2890625, "learning_rate": 1.9996867005594193e-05, "loss": 1.0828, "reward": 1.8867188394069672, "reward_std": 0.30077723413705826, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.9715402126312256, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 107.68973541259766, "epoch": 0.10753491150772908, "grad_norm": 63.05976486206055, "kl": 3.51171875, "learning_rate": 1.9996600498602334e-05, "loss": 0.6812, "reward": 1.998883992433548, "reward_std": 0.23071251437067986, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9765625447034836, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 99.57812881469727, "epoch": 0.10783361959525053, "grad_norm": 6.76568603515625, "kl": 1.3369140625, "learning_rate": 1.9996323116288906e-05, "loss": 0.2769, "reward": 1.9430804252624512, "reward_std": 0.1607932336628437, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9575893133878708, "rewards/tag_count_reward": 0.9854910969734192, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 127.0245590209961, "epoch": 0.108132327682772, "grad_norm": 58.9040412902832, "kl": 2.056640625, "learning_rate": 1.9996034858955667e-05, "loss": 0.8712, "reward": 1.9559152722358704, "reward_std": 0.3063451014459133, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.9308036118745804, "rewards/tag_count_reward": 0.976004496216774, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 104.4754524230957, "epoch": 0.10843103577029348, "grad_norm": 42.0968132019043, "kl": 2.794921875, "learning_rate": 1.9995735726916223e-05, "loss": 0.567, "reward": 2.1378349661827087, "reward_std": 0.19813796505331993, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9815848618745804, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 101.3214340209961, "epoch": 0.10872974385781495, "grad_norm": 31.76622772216797, "kl": 2.802734375, "learning_rate": 1.9995425720495993e-05, "loss": 0.533, "reward": 2.1328126192092896, "reward_std": 0.2248883657157421, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.9486607611179352, "rewards/tag_count_reward": 0.9810268133878708, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 90.24330711364746, "epoch": 0.10902845194533642, "grad_norm": 18.62177848815918, "kl": 1.048828125, "learning_rate": 1.999510484003224e-05, "loss": 0.2586, "reward": 2.0245536863803864, "reward_std": 0.22086378652602434, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9575893431901932, "rewards/tag_count_reward": 0.98214291036129, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 101.2120590209961, "epoch": 0.10932716003285789, "grad_norm": 9.402345657348633, "kl": 1.90625, "learning_rate": 1.9994773085874043e-05, "loss": 0.4769, "reward": 1.944196492433548, "reward_std": 0.19667630456387997, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9843750298023224, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 98.32366561889648, "epoch": 0.10962586812037936, "grad_norm": 19.75371742248535, "kl": 1.1923828125, "learning_rate": 1.9994430458382323e-05, "loss": 0.3437, "reward": 1.985491156578064, "reward_std": 0.1915955226868391, "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9854910969734192, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 100.68973731994629, "epoch": 0.10992457620790083, "grad_norm": 36.94270324707031, "kl": 1.2021484375, "learning_rate": 1.999407695792982e-05, "loss": 0.4777, "reward": 1.9492188394069672, "reward_std": 0.24172795191407204, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9827009439468384, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 106.1339340209961, "epoch": 0.1102232842954223, "grad_norm": 105.19280242919922, "kl": 3.873046875, "learning_rate": 1.9993712584901116e-05, "loss": 0.6056, "reward": 1.9458706378936768, "reward_std": 0.21917320415377617, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.9531250298023224, "rewards/tag_count_reward": 0.983816996216774, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 109.65402221679688, "epoch": 0.11052199238294377, "grad_norm": 301.12469482421875, "kl": 9.1015625, "learning_rate": 1.999333733969261e-05, "loss": 1.5232, "reward": 2.0591518580913544, "reward_std": 0.20072617754340172, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.95089291036129, "rewards/tag_count_reward": 0.9854911118745804, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 107.2812557220459, "epoch": 0.11082070047046524, "grad_norm": 30.644227981567383, "kl": 2.75390625, "learning_rate": 1.999295122271253e-05, "loss": 0.283, "reward": 1.958147406578064, "reward_std": 0.17421126179397106, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9893973767757416, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 113.38170433044434, "epoch": 0.1111194085579867, "grad_norm": 8.911680221557617, "kl": 1.34765625, "learning_rate": 1.999255423438093e-05, "loss": 0.3046, "reward": 1.948102742433548, "reward_std": 0.18236742354929447, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9620535969734192, "rewards/tag_count_reward": 0.9860491454601288, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 106.50670051574707, "epoch": 0.11141811664550817, "grad_norm": 29.907556533813477, "kl": 1.44140625, "learning_rate": 1.9992146375129703e-05, "loss": 0.3231, "reward": 2.107701003551483, "reward_std": 0.21211299300193787, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9849330633878708, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 96.7433090209961, "epoch": 0.11171682473302964, "grad_norm": 24.505584716796875, "kl": 0.76171875, "learning_rate": 1.9991727645402556e-05, "loss": 0.2887, "reward": 2.012276828289032, "reward_std": 0.17846343107521534, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9921875447034836, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 107.72545051574707, "epoch": 0.11201553282055111, "grad_norm": 11.180665969848633, "kl": 1.19140625, "learning_rate": 1.9991298045655022e-05, "loss": 0.3358, "reward": 2.0111607909202576, "reward_std": 0.24664813000708818, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.95089291036129, "rewards/tag_count_reward": 0.9843750298023224, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 91.60714721679688, "epoch": 0.11231424090807259, "grad_norm": 16.66785430908203, "kl": 1.375, "learning_rate": 1.9990857576354466e-05, "loss": 0.1516, "reward": 1.9921875894069672, "reward_std": 0.13831850700080395, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9687500596046448, "rewards/tag_count_reward": 0.9854911118745804, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 102.55134391784668, "epoch": 0.11261294899559406, "grad_norm": 33.98971176147461, "kl": 1.828125, "learning_rate": 1.999040623798008e-05, "loss": 0.3323, "reward": 2.020089417695999, "reward_std": 0.18774162605404854, "rewards/accuracy_reward": 0.0691964291036129, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9866071790456772, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 100.6160774230957, "epoch": 0.11291165708311553, "grad_norm": 7.860510349273682, "kl": 0.9833984375, "learning_rate": 1.998994403102287e-05, "loss": 0.2444, "reward": 2.0167411863803864, "reward_std": 0.15771889686584473, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.9899553954601288, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 87.28571701049805, "epoch": 0.113210365170637, "grad_norm": 2.934890031814575, "kl": 0.48486328125, "learning_rate": 1.9989470955985674e-05, "loss": 0.0662, "reward": 2.0100447237491608, "reward_std": 0.08882077224552631, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9944196939468384, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 94.83259582519531, "epoch": 0.11350907325815847, "grad_norm": 0.6139289736747742, "kl": 0.4326171875, "learning_rate": 1.9988987013383153e-05, "loss": 0.0227, "reward": 2.0379465222358704, "reward_std": 0.08432949241250753, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9977678656578064, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 123.67411041259766, "epoch": 0.11380778134567994, "grad_norm": 30.407466888427734, "kl": 0.4951171875, "learning_rate": 1.9988492203741783e-05, "loss": 0.365, "reward": 2.051339417695999, "reward_std": 0.22670074179768562, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9754464775323868, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 139.9598274230957, "epoch": 0.11410648943320141, "grad_norm": 39.44997024536133, "kl": 0.720703125, "learning_rate": 1.9987986527599876e-05, "loss": 0.468, "reward": 1.9285715520381927, "reward_std": 0.2951784208416939, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9263393133878708, "rewards/tag_count_reward": 0.96651791036129, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 194.56921005249023, "epoch": 0.11440519752072287, "grad_norm": 15.612531661987305, "kl": 1.87890625, "learning_rate": 1.9987469985507553e-05, "loss": 0.6274, "reward": 1.8828125596046448, "reward_std": 0.3664553239941597, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.8950893431901932, "rewards/tag_count_reward": 0.9430803954601288, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 228.32590103149414, "epoch": 0.11470390560824434, "grad_norm": 25.729787826538086, "kl": 4.4609375, "learning_rate": 1.9986942578026767e-05, "loss": 1.1501, "reward": 1.9029018878936768, "reward_std": 0.46402931958436966, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.8816964775323868, "rewards/tag_count_reward": 0.9475446790456772, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 349.35939025878906, "epoch": 0.11500261369576581, "grad_norm": 104.3390121459961, "kl": 10.25, "learning_rate": 1.998640430573128e-05, "loss": 1.5828, "reward": 1.6456473767757416, "reward_std": 0.6415105164051056, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7656250447034836, "rewards/tag_count_reward": 0.8800223618745804, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 326.1116180419922, "epoch": 0.11530132178328728, "grad_norm": 47.45319747924805, "kl": 7.7890625, "learning_rate": 1.9985855169206678e-05, "loss": 1.4996, "reward": 1.766741156578064, "reward_std": 0.6059208810329437, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.770089328289032, "rewards/tag_count_reward": 0.8895089775323868, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 411.5424270629883, "epoch": 0.11560002987080875, "grad_norm": 13.086767196655273, "kl": 5.734375, "learning_rate": 1.9985295169050374e-05, "loss": 0.9771, "reward": 1.6713170409202576, "reward_std": 0.6725886762142181, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.7477678954601288, "rewards/tag_count_reward": 0.852120578289032, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 357.4486846923828, "epoch": 0.11589873795833022, "grad_norm": 50.7424430847168, "kl": 2.212890625, "learning_rate": 1.998472430587159e-05, "loss": 0.8386, "reward": 1.7879465222358704, "reward_std": 0.6029689162969589, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.8191964477300644, "rewards/tag_count_reward": 0.8973214775323868, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 346.38394927978516, "epoch": 0.1161974460458517, "grad_norm": 38.59482955932617, "kl": 1.265625, "learning_rate": 1.9984142580291368e-05, "loss": 0.6204, "reward": 1.844866156578064, "reward_std": 0.5639853626489639, "rewards/accuracy_reward": 0.09375000395812094, "rewards/format_reward": 0.8325893133878708, "rewards/tag_count_reward": 0.918526828289032, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 334.4397430419922, "epoch": 0.11649615413337316, "grad_norm": 27.184743881225586, "kl": 1.5859375, "learning_rate": 1.9983549992942572e-05, "loss": 0.6112, "reward": 1.8593750894069672, "reward_std": 0.5217083916068077, "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.8549107611179352, "rewards/tag_count_reward": 0.9196428954601288, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 313.6651916503906, "epoch": 0.11679486222089464, "grad_norm": 9.731590270996094, "kl": 2.6875, "learning_rate": 1.9982946544469875e-05, "loss": 0.6544, "reward": 1.85100457072258, "reward_std": 0.46598145365715027, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.8616071790456772, "rewards/tag_count_reward": 0.938058078289032, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 319.8325996398926, "epoch": 0.1170935703084161, "grad_norm": 34.422794342041016, "kl": 5.4375, "learning_rate": 1.998233223552977e-05, "loss": 0.9458, "reward": 1.9268974363803864, "reward_std": 0.48969750851392746, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.8660714775323868, "rewards/tag_count_reward": 0.9268973618745804, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 262.2477836608887, "epoch": 0.11739227839593756, "grad_norm": 31.684097290039062, "kl": 4.26953125, "learning_rate": 1.998170706679057e-05, "loss": 0.7104, "reward": 1.8632813394069672, "reward_std": 0.36708933860063553, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.901785746216774, "rewards/tag_count_reward": 0.9592634290456772, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 250.0290298461914, "epoch": 0.11769098648345903, "grad_norm": 24.6363582611084, "kl": 2.94921875, "learning_rate": 1.998107103893239e-05, "loss": 0.6222, "reward": 1.9553572237491608, "reward_std": 0.3598385378718376, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.9575893133878708, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 219.6852798461914, "epoch": 0.1179896945709805, "grad_norm": 3.5105626583099365, "kl": 1.1025390625, "learning_rate": 1.9980424152647174e-05, "loss": 0.2716, "reward": 1.9787947237491608, "reward_std": 0.2785516530275345, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9397321790456772, "rewards/tag_count_reward": 0.981026828289032, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 209.68750762939453, "epoch": 0.11828840265850198, "grad_norm": 11.382622718811035, "kl": 0.85546875, "learning_rate": 1.9979766408638664e-05, "loss": 0.3203, "reward": 1.9620536863803864, "reward_std": 0.3112982511520386, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9174107611179352, "rewards/tag_count_reward": 0.973214328289032, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 212.46429824829102, "epoch": 0.11858711074602345, "grad_norm": 9.439735412597656, "kl": 0.9912109375, "learning_rate": 1.9979097807622424e-05, "loss": 0.3695, "reward": 1.9213170111179352, "reward_std": 0.3405046910047531, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9218750447034836, "rewards/tag_count_reward": 0.961495578289032, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 204.20536422729492, "epoch": 0.11888581883354492, "grad_norm": 4.362215995788574, "kl": 1.025390625, "learning_rate": 1.9978418350325825e-05, "loss": 0.4346, "reward": 1.9994420409202576, "reward_std": 0.24720771610736847, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9704241454601288, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 220.58483123779297, "epoch": 0.11918452692106639, "grad_norm": 10.616738319396973, "kl": 1.423828125, "learning_rate": 1.9977728037488052e-05, "loss": 0.4541, "reward": 1.9877232909202576, "reward_std": 0.3166257292032242, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9676339775323868, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 193.14509963989258, "epoch": 0.11948323500858786, "grad_norm": 6.033414840698242, "kl": 0.8818359375, "learning_rate": 1.99770268698601e-05, "loss": 0.2355, "reward": 2.0552456378936768, "reward_std": 0.14855567552149296, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.98604916036129, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 207.89733123779297, "epoch": 0.11978194309610933, "grad_norm": 331.3157043457031, "kl": 33.826171875, "learning_rate": 1.9976314848204762e-05, "loss": 2.2884, "reward": 2.064174234867096, "reward_std": 0.25350454822182655, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9748884290456772, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 206.30358123779297, "epoch": 0.1200806511836308, "grad_norm": 17.683977127075195, "kl": 2.62158203125, "learning_rate": 1.9975591973296657e-05, "loss": 0.2984, "reward": 2.018415331840515, "reward_std": 0.19692710414528847, "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9804687947034836, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 233.1540298461914, "epoch": 0.12037935927115227, "grad_norm": 2.453394889831543, "kl": 0.80419921875, "learning_rate": 1.9974858245922192e-05, "loss": 0.2355, "reward": 2.009486675262451, "reward_std": 0.2697831057012081, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.948660746216774, "rewards/tag_count_reward": 0.9715402275323868, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 216.28125762939453, "epoch": 0.12067806735867373, "grad_norm": 2.828160524368286, "kl": 0.41357421875, "learning_rate": 1.99741136668796e-05, "loss": 0.106, "reward": 1.9637277722358704, "reward_std": 0.1368898805230856, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9882813096046448, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 201.18304443359375, "epoch": 0.1209767754461952, "grad_norm": 2.1057558059692383, "kl": 0.88330078125, "learning_rate": 1.997335823697891e-05, "loss": 0.2456, "reward": 1.9960938394069672, "reward_std": 0.15028274059295654, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.984933078289032, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 208.08929061889648, "epoch": 0.12127548353371667, "grad_norm": 2.2948734760284424, "kl": 0.974609375, "learning_rate": 1.997259195704195e-05, "loss": 0.5664, "reward": 2.0050223767757416, "reward_std": 0.30861789733171463, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.948660746216774, "rewards/tag_count_reward": 0.9648437947034836, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 298.37947845458984, "epoch": 0.12157419162123814, "grad_norm": 16.11134147644043, "kl": 3.79296875, "learning_rate": 1.997181482790236e-05, "loss": 1.151, "reward": 1.7767857909202576, "reward_std": 0.5725193098187447, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.8035714775323868, "rewards/tag_count_reward": 0.8995535969734192, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 430.8303756713867, "epoch": 0.12187289970875961, "grad_norm": 40.530540466308594, "kl": 4.06640625, "learning_rate": 1.997102685040558e-05, "loss": 1.1053, "reward": 1.4620536267757416, "reward_std": 0.7285864949226379, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.6093750298023224, "rewards/tag_count_reward": 0.8058036118745804, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 522.0558319091797, "epoch": 0.12217160779628108, "grad_norm": 16.41097068786621, "kl": 4.5390625, "learning_rate": 1.9970228025408854e-05, "loss": 1.0403, "reward": 1.2857143580913544, "reward_std": 0.7739853709936142, "rewards/accuracy_reward": 0.024553572293370962, "rewards/format_reward": 0.5133928656578064, "rewards/tag_count_reward": 0.7477678954601288, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 567.5089645385742, "epoch": 0.12247031588380256, "grad_norm": 36.32084655761719, "kl": 7.890625, "learning_rate": 1.996941835378122e-05, "loss": 1.0757, "reward": 1.1233259439468384, "reward_std": 0.736218199133873, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.4151785969734192, "rewards/tag_count_reward": 0.7014509290456772, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 498.4018096923828, "epoch": 0.12276902397132403, "grad_norm": 24.499990463256836, "kl": 5.4375, "learning_rate": 1.9968597836403526e-05, "loss": 1.1346, "reward": 1.3147321939468384, "reward_std": 0.7316595762968063, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.5290178656578064, "rewards/tag_count_reward": 0.7790178954601288, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 387.1205520629883, "epoch": 0.1230677320588455, "grad_norm": 6.702353477478027, "kl": 3.94921875, "learning_rate": 1.9967766474168416e-05, "loss": 1.0233, "reward": 1.4877232909202576, "reward_std": 0.7043821662664413, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.6071428656578064, "rewards/tag_count_reward": 0.8359375149011612, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 260.3102836608887, "epoch": 0.12336644014636697, "grad_norm": 21.270042419433594, "kl": 1.5625, "learning_rate": 1.9966924267980326e-05, "loss": 0.8447, "reward": 1.665178656578064, "reward_std": 0.5587232038378716, "rewards/accuracy_reward": 0.0066964291036129, "rewards/format_reward": 0.7500000149011612, "rewards/tag_count_reward": 0.9084821939468384, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 164.52902603149414, "epoch": 0.12366514823388844, "grad_norm": 14.543283462524414, "kl": 0.818359375, "learning_rate": 1.9966071218755497e-05, "loss": 0.4925, "reward": 1.8856027722358704, "reward_std": 0.4624323472380638, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.8504464775323868, "rewards/tag_count_reward": 0.9481027275323868, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 146.18973922729492, "epoch": 0.1239638563214099, "grad_norm": 10.31971549987793, "kl": 1.025390625, "learning_rate": 1.9965207327421964e-05, "loss": 0.3628, "reward": 1.8922992050647736, "reward_std": 0.4822889342904091, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.84151791036129, "rewards/tag_count_reward": 0.9436384439468384, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 146.0223274230957, "epoch": 0.12426256440893137, "grad_norm": 4.78293514251709, "kl": 1.380859375, "learning_rate": 1.996433259491955e-05, "loss": 0.3747, "reward": 1.8264509439468384, "reward_std": 0.5031522810459137, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.8303571939468384, "rewards/tag_count_reward": 0.9291295111179352, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 218.83036422729492, "epoch": 0.12456127249645284, "grad_norm": 13.799188613891602, "kl": 3.87890625, "learning_rate": 1.9963447022199884e-05, "loss": 0.9672, "reward": 1.731584906578064, "reward_std": 0.5768922120332718, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.7522321790456772, "rewards/tag_count_reward": 0.9079241454601288, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 201.29241943359375, "epoch": 0.12485998058397431, "grad_norm": 23.20499610900879, "kl": 4.4453125, "learning_rate": 1.9962550610226382e-05, "loss": 1.2794, "reward": 1.8459822237491608, "reward_std": 0.5354795008897781, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.8258928954601288, "rewards/tag_count_reward": 0.9330357611179352, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 184.8817024230957, "epoch": 0.1251586886714958, "grad_norm": 12.880730628967285, "kl": 3.2578125, "learning_rate": 1.996164335997425e-05, "loss": 1.0718, "reward": 1.846540242433548, "reward_std": 0.3894262984395027, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.8839286118745804, "rewards/tag_count_reward": 0.9559152126312256, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 124.49777412414551, "epoch": 0.12545739675901724, "grad_norm": 8.311551094055176, "kl": 1.041015625, "learning_rate": 1.9960725272430487e-05, "loss": 0.4118, "reward": 2.074218839406967, "reward_std": 0.2333321049809456, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 111.2187557220459, "epoch": 0.1257561048465387, "grad_norm": 1.0475260019302368, "kl": 0.365966796875, "learning_rate": 1.9959796348593886e-05, "loss": 0.0859, "reward": 1.9983259737491608, "reward_std": 0.05460946727544069, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.998325914144516, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 121.44643211364746, "epoch": 0.12605481293406018, "grad_norm": 1.2835907936096191, "kl": 0.57421875, "learning_rate": 1.9958856589475018e-05, "loss": 0.1846, "reward": 2.0368304550647736, "reward_std": 0.0855904296040535, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9944196939468384, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 140.58036422729492, "epoch": 0.12635352102158165, "grad_norm": 4.180211067199707, "kl": 0.912109375, "learning_rate": 1.995790599609626e-05, "loss": 0.4336, "reward": 2.0385045409202576, "reward_std": 0.12638136185705662, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.991629496216774, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 156.83482360839844, "epoch": 0.12665222910910312, "grad_norm": 4.879977226257324, "kl": 1.0625, "learning_rate": 1.9956944569491756e-05, "loss": 0.3839, "reward": 2.0859375596046448, "reward_std": 0.23040233924984932, "rewards/accuracy_reward": 0.14062500186264515, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9877232611179352, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 174.10715103149414, "epoch": 0.1269509371966246, "grad_norm": 2.836599826812744, "kl": 0.763671875, "learning_rate": 1.995597231070744e-05, "loss": 0.3429, "reward": 1.9570313692092896, "reward_std": 0.23029975220561028, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9352678954601288, "rewards/tag_count_reward": 0.9838170111179352, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 196.18973922729492, "epoch": 0.12724964528414606, "grad_norm": 5.131869316101074, "kl": 1.197265625, "learning_rate": 1.9954989220801046e-05, "loss": 0.4472, "reward": 1.9860492050647736, "reward_std": 0.2938057780265808, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.9285714775323868, "rewards/tag_count_reward": 0.977120578289032, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 259.0156364440918, "epoch": 0.12754835337166753, "grad_norm": 23.351150512695312, "kl": 2.798828125, "learning_rate": 1.9953995300842073e-05, "loss": 0.81, "reward": 1.8504465222358704, "reward_std": 0.41297096014022827, "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.8504464626312256, "rewards/tag_count_reward": 0.9620536118745804, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 313.92858123779297, "epoch": 0.127847061459189, "grad_norm": 9.90182113647461, "kl": 3.54296875, "learning_rate": 1.9952990551911808e-05, "loss": 0.8487, "reward": 1.8041295409202576, "reward_std": 0.5224436521530151, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.7633928954601288, "rewards/tag_count_reward": 0.93136166036129, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 303.4732208251953, "epoch": 0.12814576954671048, "grad_norm": 37.676483154296875, "kl": 1.833984375, "learning_rate": 1.995197497510332e-05, "loss": 0.696, "reward": 1.774553656578064, "reward_std": 0.4945437088608742, "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.8058036118745804, "rewards/tag_count_reward": 0.948660746216774, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 262.35046005249023, "epoch": 0.12844447763423195, "grad_norm": 27.030941009521484, "kl": 1.0615234375, "learning_rate": 1.9950948571521458e-05, "loss": 0.5843, "reward": 1.8906250894069672, "reward_std": 0.3540447913110256, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.8839286118745804, "rewards/tag_count_reward": 0.9709821939468384, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 263.6629638671875, "epoch": 0.12874318572175342, "grad_norm": 11.580384254455566, "kl": 1.625, "learning_rate": 1.994991134228285e-05, "loss": 0.7227, "reward": 1.9447545111179352, "reward_std": 0.3698681965470314, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.8727678805589676, "rewards/tag_count_reward": 0.9648437798023224, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 314.8526916503906, "epoch": 0.1290418938092749, "grad_norm": 30.86504364013672, "kl": 4.07421875, "learning_rate": 1.9948863288515895e-05, "loss": 0.9746, "reward": 1.81975457072258, "reward_std": 0.47684966772794724, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.7968750298023224, "rewards/tag_count_reward": 0.9469866305589676, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 354.38170623779297, "epoch": 0.12934060189679636, "grad_norm": 35.50193405151367, "kl": 4.890625, "learning_rate": 1.9947804411360775e-05, "loss": 1.0969, "reward": 1.694196492433548, "reward_std": 0.5967399328947067, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.7321428954601288, "rewards/tag_count_reward": 0.9218750298023224, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 362.69420623779297, "epoch": 0.12963930998431783, "grad_norm": 6.776815891265869, "kl": 3.30078125, "learning_rate": 1.9946734711969447e-05, "loss": 0.941, "reward": 1.6830357909202576, "reward_std": 0.5641478002071381, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.7455357611179352, "rewards/tag_count_reward": 0.9218750447034836, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 294.2522430419922, "epoch": 0.1299380180718393, "grad_norm": 24.09188461303711, "kl": 1.29296875, "learning_rate": 1.994565419150564e-05, "loss": 0.6386, "reward": 1.9202010035514832, "reward_std": 0.41428857296705246, "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.8571428954601288, "rewards/tag_count_reward": 0.9514509290456772, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 295.29466247558594, "epoch": 0.13023672615936077, "grad_norm": 29.491731643676758, "kl": 1.23828125, "learning_rate": 1.9944562851144846e-05, "loss": 0.7042, "reward": 1.8264509737491608, "reward_std": 0.42318835109472275, "rewards/accuracy_reward": 0.0066964291036129, "rewards/format_reward": 0.8683036267757416, "rewards/tag_count_reward": 0.9514509290456772, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 330.76116943359375, "epoch": 0.13053543424688224, "grad_norm": 15.287300109863281, "kl": 2.2255859375, "learning_rate": 1.9943460692074345e-05, "loss": 0.8275, "reward": 1.7806920409202576, "reward_std": 0.48121991008520126, "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.8214286267757416, "rewards/tag_count_reward": 0.930245578289032, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 335.4665298461914, "epoch": 0.1308341423344037, "grad_norm": 18.96664047241211, "kl": 3.75, "learning_rate": 1.994234771549317e-05, "loss": 1.1076, "reward": 1.8816965520381927, "reward_std": 0.5650221183896065, "rewards/accuracy_reward": 0.14508929592557251, "rewards/format_reward": 0.8214286118745804, "rewards/tag_count_reward": 0.9151786118745804, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 328.87500762939453, "epoch": 0.13113285042192518, "grad_norm": 22.485137939453125, "kl": 3.7734375, "learning_rate": 1.9941223922612143e-05, "loss": 1.0507, "reward": 1.7600446939468384, "reward_std": 0.5595772713422775, "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.823660746216774, "rewards/tag_count_reward": 0.8984375447034836, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 389.1250228881836, "epoch": 0.13143155850944666, "grad_norm": 23.768217086791992, "kl": 4.77734375, "learning_rate": 1.9940089314653826e-05, "loss": 1.2209, "reward": 1.7332590222358704, "reward_std": 0.6468324810266495, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.7500000447034836, "rewards/tag_count_reward": 0.8671875596046448, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 356.25001525878906, "epoch": 0.13173026659696813, "grad_norm": 20.457050323486328, "kl": 2.8828125, "learning_rate": 1.9938943892852575e-05, "loss": 0.9757, "reward": 1.7544643878936768, "reward_std": 0.6150671541690826, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.8035714626312256, "rewards/tag_count_reward": 0.8883928954601288, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 347.8549270629883, "epoch": 0.13202897468448957, "grad_norm": 28.949844360351562, "kl": 2.35546875, "learning_rate": 1.9937787658454484e-05, "loss": 0.9389, "reward": 1.776227742433548, "reward_std": 0.5770691707730293, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.8102678954601288, "rewards/tag_count_reward": 0.8945312947034836, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 314.71653747558594, "epoch": 0.13232768277201104, "grad_norm": 25.478309631347656, "kl": 2.162109375, "learning_rate": 1.993662061271743e-05, "loss": 0.8651, "reward": 1.8515625894069672, "reward_std": 0.46982043981552124, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.854910746216774, "rewards/tag_count_reward": 0.9229911118745804, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 382.3884048461914, "epoch": 0.1326263908595325, "grad_norm": 5.048300266265869, "kl": 3.61328125, "learning_rate": 1.9935442756911044e-05, "loss": 1.0595, "reward": 1.6718750894069672, "reward_std": 0.6182444095611572, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.767857164144516, "rewards/tag_count_reward": 0.8660714626312256, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 330.24554443359375, "epoch": 0.13292509894705398, "grad_norm": 14.027213096618652, "kl": 3.3515625, "learning_rate": 1.9934254092316716e-05, "loss": 1.0238, "reward": 1.84319207072258, "reward_std": 0.5435174703598022, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.8370536118745804, "rewards/tag_count_reward": 0.8989955633878708, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 266.3928756713867, "epoch": 0.13322380703457545, "grad_norm": 10.927474021911621, "kl": 2.40234375, "learning_rate": 1.9933054620227595e-05, "loss": 0.8233, "reward": 1.9319196939468384, "reward_std": 0.43095193058252335, "rewards/accuracy_reward": 0.11160715110599995, "rewards/format_reward": 0.8883928954601288, "rewards/tag_count_reward": 0.9319196790456772, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 235.73439025878906, "epoch": 0.13352251512209692, "grad_norm": 5.911532402038574, "kl": 1.482421875, "learning_rate": 1.9931844341948595e-05, "loss": 0.5513, "reward": 2.0429688692092896, "reward_std": 0.265589214861393, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.9397321939468384, "rewards/tag_count_reward": 0.960379496216774, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 210.03795623779297, "epoch": 0.1338212232096184, "grad_norm": 6.347651958465576, "kl": 0.75048828125, "learning_rate": 1.9930623258796373e-05, "loss": 0.3741, "reward": 1.9570313394069672, "reward_std": 0.19616254791617393, "rewards/accuracy_reward": 0.01116071455180645, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9793527126312256, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 209.86608123779297, "epoch": 0.13411993129713987, "grad_norm": 11.930597305297852, "kl": 0.9833984375, "learning_rate": 1.9929391372099352e-05, "loss": 0.4424, "reward": 1.9933037161827087, "reward_std": 0.24066746979951859, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9687500447034836, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 207.59822463989258, "epoch": 0.13441863938466134, "grad_norm": 6.748196125030518, "kl": 0.653564453125, "learning_rate": 1.9928148683197705e-05, "loss": 0.2876, "reward": 1.9966519176959991, "reward_std": 0.12447400391101837, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9854910969734192, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 180.7946548461914, "epoch": 0.1347173474721828, "grad_norm": 4.706009864807129, "kl": 0.94287109375, "learning_rate": 1.9926895193443352e-05, "loss": 0.444, "reward": 1.9944197535514832, "reward_std": 0.2124867830425501, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9743303954601288, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 201.66964721679688, "epoch": 0.13501605555970428, "grad_norm": 16.50455093383789, "kl": 1.794921875, "learning_rate": 1.992563090419997e-05, "loss": 0.7863, "reward": 1.9202010035514832, "reward_std": 0.29516417905688286, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.9397321790456772, "rewards/tag_count_reward": 0.9626116305589676, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 198.2232208251953, "epoch": 0.13531476364722575, "grad_norm": 21.572233200073242, "kl": 2.01953125, "learning_rate": 1.992435581684298e-05, "loss": 0.8455, "reward": 1.9514510035514832, "reward_std": 0.35679007321596146, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.953683078289032, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 215.06697463989258, "epoch": 0.13561347173474722, "grad_norm": 24.052757263183594, "kl": 2.48046875, "learning_rate": 1.9923069932759554e-05, "loss": 0.9742, "reward": 1.876116156578064, "reward_std": 0.4185222163796425, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.8928571790456772, "rewards/tag_count_reward": 0.9408482611179352, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 199.23884963989258, "epoch": 0.1359121798222687, "grad_norm": 9.227279663085938, "kl": 1.841796875, "learning_rate": 1.9921773253348604e-05, "loss": 0.8241, "reward": 2.0200893878936768, "reward_std": 0.3371174670755863, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.9241071939468384, "rewards/tag_count_reward": 0.9531250447034836, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 216.81474685668945, "epoch": 0.13621088790979016, "grad_norm": 16.004623413085938, "kl": 1.4072265625, "learning_rate": 1.9920465780020794e-05, "loss": 0.7111, "reward": 1.8922991752624512, "reward_std": 0.35329464077949524, "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.9285714626312256, "rewards/tag_count_reward": 0.9503348767757416, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 205.61831283569336, "epoch": 0.13650959599731163, "grad_norm": 24.311433792114258, "kl": 1.0244140625, "learning_rate": 1.9919147514198526e-05, "loss": 0.6727, "reward": 1.9447545409202576, "reward_std": 0.3132414221763611, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9464286118745804, "rewards/tag_count_reward": 0.96261166036129, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 184.50670623779297, "epoch": 0.1368083040848331, "grad_norm": 19.71268653869629, "kl": 0.82177734375, "learning_rate": 1.991781845731594e-05, "loss": 0.5589, "reward": 2.044642925262451, "reward_std": 0.21709422580897808, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9754464626312256, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 192.54241943359375, "epoch": 0.13710701217235458, "grad_norm": 2.0376386642456055, "kl": 1.169921875, "learning_rate": 1.991647861081893e-05, "loss": 0.5359, "reward": 1.9123884737491608, "reward_std": 0.3281407877802849, "rewards/accuracy_reward": 0.01785714295692742, "rewards/format_reward": 0.93526791036129, "rewards/tag_count_reward": 0.9592634439468384, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 174.62723922729492, "epoch": 0.13740572025987605, "grad_norm": 2.084235191345215, "kl": 0.63037109375, "learning_rate": 1.9915127976165104e-05, "loss": 0.3418, "reward": 2.035714328289032, "reward_std": 0.20712930709123611, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9776786118745804, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 197.79241943359375, "epoch": 0.13770442834739752, "grad_norm": 2.1881840229034424, "kl": 0.410888671875, "learning_rate": 1.991376655482383e-05, "loss": 0.1832, "reward": 2.238281339406967, "reward_std": 0.1333402916789055, "rewards/accuracy_reward": 0.2656250149011612, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9882812798023224, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 179.73884963989258, "epoch": 0.138003136434919, "grad_norm": 0.6293359994888306, "kl": 0.23828125, "learning_rate": 1.9912394348276197e-05, "loss": 0.0846, "reward": 2.0970982909202576, "reward_std": 0.09921832010149956, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.994419664144516, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 200.7098274230957, "epoch": 0.13830184452244043, "grad_norm": 0.6090904474258423, "kl": 0.32666015625, "learning_rate": 1.9911011358015033e-05, "loss": 0.1687, "reward": 2.0513393580913544, "reward_std": 0.0803571455180645, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9910714477300644, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 221.5714340209961, "epoch": 0.1386005526099619, "grad_norm": 0.5349171757698059, "kl": 0.44384765625, "learning_rate": 1.9909617585544894e-05, "loss": 0.1809, "reward": 2.064174234867096, "reward_std": 0.1600718181580305, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9882812947034836, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 209.5312614440918, "epoch": 0.13889926069748337, "grad_norm": 0.6949582695960999, "kl": 0.384521484375, "learning_rate": 1.9908213032382072e-05, "loss": 0.153, "reward": 2.07366082072258, "reward_std": 0.1384084653109312, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.986607164144516, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 258.38171005249023, "epoch": 0.13919796878500484, "grad_norm": 1.3697000741958618, "kl": 0.712890625, "learning_rate": 1.9906797700054576e-05, "loss": 0.3014, "reward": 2.014508992433548, "reward_std": 0.24822691082954407, "rewards/accuracy_reward": 0.08705357275903225, "rewards/format_reward": 0.948660746216774, "rewards/tag_count_reward": 0.9787946939468384, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 307.5602798461914, "epoch": 0.13949667687252632, "grad_norm": 0.6018795371055603, "kl": 1.1552734375, "learning_rate": 1.9905371590102157e-05, "loss": 0.4506, "reward": 1.9771206378936768, "reward_std": 0.3474641740322113, "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.926339328289032, "rewards/tag_count_reward": 0.9637277275323868, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 376.6428756713867, "epoch": 0.13979538496004779, "grad_norm": 34.19148635864258, "kl": 4.427734375, "learning_rate": 1.9903934704076273e-05, "loss": 0.699, "reward": 1.9029018878936768, "reward_std": 0.3891730532050133, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.9062500298023224, "rewards/tag_count_reward": 0.9453125447034836, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 423.13616943359375, "epoch": 0.14009409304756926, "grad_norm": 1.6306413412094116, "kl": 1.71875, "learning_rate": 1.9902487043540125e-05, "loss": 0.5591, "reward": 1.8102679550647736, "reward_std": 0.5160161182284355, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.8482143431901932, "rewards/tag_count_reward": 0.9196428954601288, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 477.8951110839844, "epoch": 0.14039280113509073, "grad_norm": 1.2516266107559204, "kl": 2.341796875, "learning_rate": 1.990102861006862e-05, "loss": 0.669, "reward": 1.7282366752624512, "reward_std": 0.5892422199249268, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.7968750447034836, "rewards/tag_count_reward": 0.8911830633878708, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 468.0937805175781, "epoch": 0.1406915092226122, "grad_norm": 1.7512751817703247, "kl": 2.6875, "learning_rate": 1.989955940524839e-05, "loss": 0.7296, "reward": 1.766183078289032, "reward_std": 0.6171711832284927, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.7991071790456772, "rewards/tag_count_reward": 0.8978795111179352, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 492.8705596923828, "epoch": 0.14099021731013367, "grad_norm": 1.8903162479400635, "kl": 2.689453125, "learning_rate": 1.9898079430677796e-05, "loss": 0.6421, "reward": 1.8632813692092896, "reward_std": 0.6257495880126953, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.8058036118745804, "rewards/tag_count_reward": 0.8967634439468384, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 487.77903747558594, "epoch": 0.14128892539765514, "grad_norm": 3.101919412612915, "kl": 3.015625, "learning_rate": 1.989658868796689e-05, "loss": 0.6687, "reward": 1.7377232909202576, "reward_std": 0.5915535241365433, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.8080357611179352, "rewards/tag_count_reward": 0.893973246216774, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 505.06029510498047, "epoch": 0.1415876334851766, "grad_norm": 4.454689979553223, "kl": 2.205078125, "learning_rate": 1.9895087178737467e-05, "loss": 0.5181, "reward": 1.8247768878936768, "reward_std": 0.5208612680435181, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.839285746216774, "rewards/tag_count_reward": 0.9140625447034836, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 503.87279510498047, "epoch": 0.14188634157269808, "grad_norm": 6.992595672607422, "kl": 2.443359375, "learning_rate": 1.9893574904623013e-05, "loss": 0.563, "reward": 1.774553656578064, "reward_std": 0.5661075338721275, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.8147321939468384, "rewards/tag_count_reward": 0.90401791036129, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 537.756721496582, "epoch": 0.14218504966021955, "grad_norm": 5.319524765014648, "kl": 1.75390625, "learning_rate": 1.989205186726874e-05, "loss": 0.3347, "reward": 1.8649554252624512, "reward_std": 0.49945591390132904, "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.8727678954601288, "rewards/tag_count_reward": 0.9296875298023224, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 575.3526916503906, "epoch": 0.14248375774774102, "grad_norm": 1.4435358047485352, "kl": 2.087890625, "learning_rate": 1.9890518068331555e-05, "loss": 0.3066, "reward": 1.9213170409202576, "reward_std": 0.3829821050167084, "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.8660714626312256, "rewards/tag_count_reward": 0.9280134588479996, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 558.1317291259766, "epoch": 0.1427824658352625, "grad_norm": 14.625527381896973, "kl": 6.5390625, "learning_rate": 1.988897350948009e-05, "loss": 0.9569, "reward": 1.5390625596046448, "reward_std": 0.7099626362323761, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.611607164144516, "rewards/tag_count_reward": 0.8046875298023224, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 419.8549270629883, "epoch": 0.14308117392278397, "grad_norm": 3.06449294090271, "kl": 2.517578125, "learning_rate": 1.988741819239467e-05, "loss": 0.6205, "reward": 1.7366072535514832, "reward_std": 0.5187483206391335, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.8191964626312256, "rewards/tag_count_reward": 0.9151786267757416, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 430.3973388671875, "epoch": 0.14337988201030544, "grad_norm": 1.4234956502914429, "kl": 1.5146484375, "learning_rate": 1.988585211876733e-05, "loss": 0.3937, "reward": 1.9469867050647736, "reward_std": 0.4831159636378288, "rewards/accuracy_reward": 0.16517857275903225, "rewards/format_reward": 0.8504464626312256, "rewards/tag_count_reward": 0.93136166036129, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 424.3370819091797, "epoch": 0.1436785900978269, "grad_norm": 0.8560442924499512, "kl": 0.433837890625, "learning_rate": 1.98842752903018e-05, "loss": 0.0966, "reward": 2.0267857909202576, "reward_std": 0.21115188673138618, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.9419643431901932, "rewards/tag_count_reward": 0.979910746216774, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 336.2611770629883, "epoch": 0.14397729818534838, "grad_norm": 0.8059180974960327, "kl": 0.7724609375, "learning_rate": 1.9882687708713514e-05, "loss": 0.1591, "reward": 2.1478796005249023, "reward_std": 0.18104978278279305, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.9665178805589676, "rewards/tag_count_reward": 0.9827009290456772, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 345.65403747558594, "epoch": 0.14427600627286985, "grad_norm": 0.46935588121414185, "kl": 0.38232421875, "learning_rate": 1.9881089375729614e-05, "loss": 0.1162, "reward": 2.0256697833538055, "reward_std": 0.09921832475811243, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.994419664144516, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 354.8080520629883, "epoch": 0.14457471436039132, "grad_norm": 0.43385452032089233, "kl": 0.6015625, "learning_rate": 1.987948029308892e-05, "loss": 0.1798, "reward": 2.048549145460129, "reward_std": 0.1856505163013935, "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9815848618745804, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 360.87947845458984, "epoch": 0.14487342244791276, "grad_norm": 0.5367817282676697, "kl": 0.65380859375, "learning_rate": 1.9877860462541964e-05, "loss": 0.1861, "reward": 2.0083706080913544, "reward_std": 0.11174611561000347, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9905134439468384, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 389.4620666503906, "epoch": 0.14517213053543424, "grad_norm": 1.2923502922058105, "kl": 1.4609375, "learning_rate": 1.9876229885850957e-05, "loss": 0.4322, "reward": 1.9330357909202576, "reward_std": 0.3044450432062149, "rewards/accuracy_reward": 0.0267857164144516, "rewards/format_reward": 0.9397321790456772, "rewards/tag_count_reward": 0.96651791036129, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 310.62947845458984, "epoch": 0.1454708386229557, "grad_norm": 1.0061389207839966, "kl": 1.021484375, "learning_rate": 1.987458856478981e-05, "loss": 0.2893, "reward": 2.0273438692092896, "reward_std": 0.17399480566382408, "rewards/accuracy_reward": 0.07366071874275804, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.984933078289032, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 361.3080596923828, "epoch": 0.14576954671047718, "grad_norm": 0.36473631858825684, "kl": 0.53857421875, "learning_rate": 1.987293650114412e-05, "loss": 0.0827, "reward": 2.006696492433548, "reward_std": 0.11607143469154835, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9910714775323868, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 410.08707427978516, "epoch": 0.14606825479799865, "grad_norm": 0.27904367446899414, "kl": 0.3759765625, "learning_rate": 1.9871273696711166e-05, "loss": 0.0907, "reward": 1.9726563096046448, "reward_std": 0.12751024402678013, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9882812798023224, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 391.2076110839844, "epoch": 0.14636696288552012, "grad_norm": 0.7404503226280212, "kl": 0.468017578125, "learning_rate": 1.9869600153299916e-05, "loss": 0.1158, "reward": 2.0178572237491608, "reward_std": 0.16072743013501167, "rewards/accuracy_reward": 0.066964291036129, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9776786118745804, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 411.15626525878906, "epoch": 0.1466656709730416, "grad_norm": 0.2604890465736389, "kl": 0.294189453125, "learning_rate": 1.986791587273103e-05, "loss": 0.0718, "reward": 1.9960938394069672, "reward_std": 0.15427246689796448, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.9687500149011612, "rewards/tag_count_reward": 0.9827009290456772, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 391.1942138671875, "epoch": 0.14696437906056306, "grad_norm": 0.24699480831623077, "kl": 0.220703125, "learning_rate": 1.986622085683683e-05, "loss": 0.0708, "reward": 2.1088170409202576, "reward_std": 0.13562809117138386, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9821428656578064, "rewards/tag_count_reward": 0.990513414144516, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 464.11832427978516, "epoch": 0.14726308714808453, "grad_norm": 0.5009165406227112, "kl": 0.248779296875, "learning_rate": 1.9864515107461332e-05, "loss": 0.069, "reward": 2.025111675262451, "reward_std": 0.13461188971996307, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9827009290456772, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 388.8303756713867, "epoch": 0.147561795235606, "grad_norm": 0.8935766220092773, "kl": 0.335693359375, "learning_rate": 1.9862798626460225e-05, "loss": 0.1059, "reward": 2.053013503551483, "reward_std": 0.14593994058668613, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9793527126312256, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 456.9799270629883, "epoch": 0.14786050332312747, "grad_norm": 4.045127868652344, "kl": 0.28759765625, "learning_rate": 1.9861071415700866e-05, "loss": 0.0718, "reward": 1.9877233505249023, "reward_std": 0.25016994401812553, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9743303954601288, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 414.5335006713867, "epoch": 0.14815921141064894, "grad_norm": 14.779664993286133, "kl": 0.321044921875, "learning_rate": 1.98593334770623e-05, "loss": 0.0675, "reward": 2.0050223767757416, "reward_std": 0.2558499500155449, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.969308078289032, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 431.9844055175781, "epoch": 0.14845791949817042, "grad_norm": 3.915581226348877, "kl": 0.20751953125, "learning_rate": 1.985758481243523e-05, "loss": 0.0269, "reward": 2.114397406578064, "reward_std": 0.23193156346678734, "rewards/accuracy_reward": 0.17633928824216127, "rewards/format_reward": 0.964285746216774, "rewards/tag_count_reward": 0.9737723767757416, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 461.8906555175781, "epoch": 0.1487566275856919, "grad_norm": 0.7710988521575928, "kl": 0.24072265625, "learning_rate": 1.9855825423722027e-05, "loss": 0.0093, "reward": 2.016741156578064, "reward_std": 0.1885010376572609, "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9877232611179352, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 414.06028747558594, "epoch": 0.14905533567321336, "grad_norm": 0.8562597632408142, "kl": 0.298095703125, "learning_rate": 1.9854055312836742e-05, "loss": -0.0085, "reward": 2.0731027722358704, "reward_std": 0.1466812090948224, "rewards/accuracy_reward": 0.10267857322469354, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9860491305589676, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 466.70092010498047, "epoch": 0.14935404376073483, "grad_norm": 0.28009530901908875, "kl": 0.203369140625, "learning_rate": 1.9852274481705078e-05, "loss": -0.0045, "reward": 1.997767984867096, "reward_std": 0.18909288942813873, "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9866071939468384, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 474.7299346923828, "epoch": 0.1496527518482563, "grad_norm": 8.873626708984375, "kl": 0.787109375, "learning_rate": 1.98504829322644e-05, "loss": -0.0155, "reward": 1.9743304550647736, "reward_std": 0.2196179386228323, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.97433041036129, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 496.75672149658203, "epoch": 0.14995145993577777, "grad_norm": 0.6570752859115601, "kl": 0.324951171875, "learning_rate": 1.9848680666463748e-05, "loss": -0.0076, "reward": 1.9419643580913544, "reward_std": 0.20667514204978943, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9754464626312256, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 536.0446548461914, "epoch": 0.15025016802329924, "grad_norm": 1.8225966691970825, "kl": 0.29248046875, "learning_rate": 1.98468676862638e-05, "loss": 0.0307, "reward": 2.037946492433548, "reward_std": 0.2455221489071846, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9776786118745804, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 481.0535888671875, "epoch": 0.1505488761108207, "grad_norm": 1.4279563426971436, "kl": 0.4560546875, "learning_rate": 1.984504399363691e-05, "loss": 0.0286, "reward": 2.0189733505249023, "reward_std": 0.2109445370733738, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9832589626312256, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 508.6473388671875, "epoch": 0.15084758419834218, "grad_norm": 1.544642686843872, "kl": 0.5458984375, "learning_rate": 1.9843209590567073e-05, "loss": -0.0054, "reward": 1.9910715520381927, "reward_std": 0.2858654819428921, "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.9687500447034836, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 540.575927734375, "epoch": 0.15114629228586363, "grad_norm": 1.4210388660430908, "kl": 0.526611328125, "learning_rate": 1.9841364479049937e-05, "loss": 0.011, "reward": 2.040736734867096, "reward_std": 0.2511899098753929, "rewards/accuracy_reward": 0.09151786146685481, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9827009290456772, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 481.9062728881836, "epoch": 0.1514450003733851, "grad_norm": 1.648840308189392, "kl": 0.4580078125, "learning_rate": 1.983950866109281e-05, "loss": -0.0048, "reward": 2.035156339406967, "reward_std": 0.22547372803092003, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9726562798023224, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 489.59600830078125, "epoch": 0.15174370846090657, "grad_norm": 3.40177059173584, "kl": 0.8023681640625, "learning_rate": 1.983764213871463e-05, "loss": 0.0021, "reward": 2.1891742050647736, "reward_std": 0.1192919984459877, "rewards/accuracy_reward": 0.22098214644938707, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9882812947034836, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 569.7924499511719, "epoch": 0.15204241654842804, "grad_norm": 0.18565990030765533, "kl": 0.3614501953125, "learning_rate": 1.9835764913945998e-05, "loss": -0.0023, "reward": 2.0044643878936768, "reward_std": 0.18401647731661797, "rewards/accuracy_reward": 0.044642857974395156, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9866071939468384, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 540.1384201049805, "epoch": 0.1523411246359495, "grad_norm": 0.5188076496124268, "kl": 0.751220703125, "learning_rate": 1.9833876988829147e-05, "loss": -0.0065, "reward": 1.9665179252624512, "reward_std": 0.12632295303046703, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9866071790456772, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 533.8995819091797, "epoch": 0.15263983272347098, "grad_norm": 1.6509788036346436, "kl": 0.16748046875, "learning_rate": 1.9831978365417958e-05, "loss": 0.0134, "reward": 2.165736734867096, "reward_std": 0.11005903966724873, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.991629496216774, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 518.0022583007812, "epoch": 0.15293854081099245, "grad_norm": 0.1702807992696762, "kl": 0.1463623046875, "learning_rate": 1.9830069045777943e-05, "loss": 0.0182, "reward": 2.1551340222358704, "reward_std": 0.15914164576679468, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.994419664144516, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 544.7656402587891, "epoch": 0.15323724889851392, "grad_norm": 0.08476340025663376, "kl": 0.205078125, "learning_rate": 1.9828149031986256e-05, "loss": 0.0044, "reward": 2.0273438096046448, "reward_std": 0.05646504834294319, "rewards/accuracy_reward": 0.0401785746216774, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9960937798023224, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 465.79913330078125, "epoch": 0.1535359569860354, "grad_norm": 0.17899715900421143, "kl": 0.0899658203125, "learning_rate": 1.982621832613169e-05, "loss": 0.0147, "reward": 2.0379465222358704, "reward_std": 0.1428947001695633, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9977678954601288, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 513.2254791259766, "epoch": 0.15383466507355686, "grad_norm": 0.42034488916397095, "kl": 0.1688232421875, "learning_rate": 1.982427693031465e-05, "loss": -0.0082, "reward": 1.9927456080913544, "reward_std": 0.14459028095006943, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9905134290456772, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 490.4397659301758, "epoch": 0.15413337316107834, "grad_norm": 0.13159312307834625, "kl": 0.138427734375, "learning_rate": 1.9822324846647195e-05, "loss": -0.0076, "reward": 1.9933035969734192, "reward_std": 0.10039532743394375, "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933035969734192, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 403.91297149658203, "epoch": 0.1544320812485998, "grad_norm": 0.1740248203277588, "kl": 0.124755859375, "learning_rate": 1.9820362077253e-05, "loss": -0.0035, "reward": 2.1183037161827087, "reward_std": 0.06855395436286926, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 473.31029510498047, "epoch": 0.15473078933612128, "grad_norm": 0.17018836736679077, "kl": 0.1016845703125, "learning_rate": 1.9818388624267362e-05, "loss": 0.0038, "reward": 2.064174234867096, "reward_std": 0.05341536086052656, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 449.14957427978516, "epoch": 0.15502949742364275, "grad_norm": 0.18472273647785187, "kl": 0.258544921875, "learning_rate": 1.9816404489837205e-05, "loss": -0.0057, "reward": 2.1545759439468384, "reward_std": 0.17156707495450974, "rewards/accuracy_reward": 0.1718750037252903, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9916294813156128, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 488.76342010498047, "epoch": 0.15532820551116422, "grad_norm": 0.13975724577903748, "kl": 0.1357421875, "learning_rate": 1.981440967612108e-05, "loss": -0.0133, "reward": 2.0407367646694183, "reward_std": 0.08793554082512856, "rewards/accuracy_reward": 0.05133928940631449, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937649011612, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 467.54466247558594, "epoch": 0.1556269135986857, "grad_norm": 0.2423180490732193, "kl": 0.609130859375, "learning_rate": 1.981240418528914e-05, "loss": -0.0353, "reward": 2.0195313692092896, "reward_std": 0.16960963793098927, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9838170111179352, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 473.01788330078125, "epoch": 0.15592562168620716, "grad_norm": 0.2458086460828781, "kl": 0.681396484375, "learning_rate": 1.981038801952316e-05, "loss": -0.0193, "reward": 2.0585938096046448, "reward_std": 0.11923796869814396, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9916294813156128, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 505.3058319091797, "epoch": 0.15622432977372863, "grad_norm": 0.42828235030174255, "kl": 1.3394775390625, "learning_rate": 1.9808361181016543e-05, "loss": -0.042, "reward": 2.064174234867096, "reward_std": 0.13647043146193027, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.98604916036129, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 524.5803756713867, "epoch": 0.1565230378612501, "grad_norm": 0.26205769181251526, "kl": 1.290771484375, "learning_rate": 1.980632367197428e-05, "loss": -0.0649, "reward": 2.059709906578064, "reward_std": 0.19578610360622406, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9815848469734192, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 560.9955596923828, "epoch": 0.15682174594877157, "grad_norm": 0.32088807225227356, "kl": 1.16845703125, "learning_rate": 1.9804275494612988e-05, "loss": -0.0838, "reward": 2.0212054550647736, "reward_std": 0.2719709537923336, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9698661118745804, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 602.4531402587891, "epoch": 0.15712045403629304, "grad_norm": 3.786750078201294, "kl": 5.116455078125, "learning_rate": 1.980221665116088e-05, "loss": -0.0383, "reward": 1.9453125894069672, "reward_std": 0.30507900565862656, "rewards/accuracy_reward": 0.07589286239817739, "rewards/format_reward": 0.917410746216774, "rewards/tag_count_reward": 0.9520089775323868, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 609.6986846923828, "epoch": 0.15741916212381452, "grad_norm": 0.5301466584205627, "kl": 1.1728515625, "learning_rate": 1.9800147143857774e-05, "loss": -0.102, "reward": 1.9335938096046448, "reward_std": 0.37417176365852356, "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.9352678954601288, "rewards/tag_count_reward": 0.9492187947034836, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 607.3326110839844, "epoch": 0.15771787021133596, "grad_norm": 0.1828712522983551, "kl": 0.444091796875, "learning_rate": 1.979806697495509e-05, "loss": -0.0469, "reward": 2.1205357909202576, "reward_std": 0.25962116569280624, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.9799107313156128, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 617.4933166503906, "epoch": 0.15801657829885743, "grad_norm": 0.23170025646686554, "kl": 0.5003662109375, "learning_rate": 1.979597614671586e-05, "loss": -0.034, "reward": 2.0200893878936768, "reward_std": 0.27646172791719437, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.9486607611179352, "rewards/tag_count_reward": 0.9687500447034836, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 583.5803833007812, "epoch": 0.1583152863863789, "grad_norm": 0.2696598172187805, "kl": 0.59619140625, "learning_rate": 1.9793874661414682e-05, "loss": -0.0629, "reward": 2.0312501192092896, "reward_std": 0.3072678670287132, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9687500447034836, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 538.2946701049805, "epoch": 0.15861399447390037, "grad_norm": 0.8620467782020569, "kl": 2.65234375, "learning_rate": 1.979176252133778e-05, "loss": -0.0765, "reward": 2.0535715520381927, "reward_std": 0.352413572371006, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.95089291036129, "rewards/tag_count_reward": 0.9642857760190964, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 587.9375228881836, "epoch": 0.15891270256142184, "grad_norm": 0.20593012869358063, "kl": 0.561279296875, "learning_rate": 1.978963972878295e-05, "loss": -0.0289, "reward": 1.993303656578064, "reward_std": 0.30338891595602036, "rewards/accuracy_reward": 0.07366071455180645, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.96651791036129, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 585.5960006713867, "epoch": 0.1592114106489433, "grad_norm": 1.1809929609298706, "kl": 1.87744140625, "learning_rate": 1.9787506286059584e-05, "loss": -0.0611, "reward": 2.037946492433548, "reward_std": 0.3022092320024967, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9709821939468384, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 575.9531555175781, "epoch": 0.15951011873646478, "grad_norm": 0.1776747852563858, "kl": 0.9912109375, "learning_rate": 1.9785362195488656e-05, "loss": -0.0728, "reward": 2.0390625596046448, "reward_std": 0.2791432626545429, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9620536267757416, "rewards/tag_count_reward": 0.9676339775323868, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 524.7500228881836, "epoch": 0.15980882682398626, "grad_norm": 0.19062799215316772, "kl": 1.5032958984375, "learning_rate": 1.978320745940273e-05, "loss": -0.0419, "reward": 2.010602742433548, "reward_std": 0.18198461830615997, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.983816996216774, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 548.6585083007812, "epoch": 0.16010753491150773, "grad_norm": 0.4999968707561493, "kl": 0.16796875, "learning_rate": 1.978104208014594e-05, "loss": -0.0067, "reward": 2.181361675262451, "reward_std": 0.17142508551478386, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973469734192, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 505.6674270629883, "epoch": 0.1604062429990292, "grad_norm": 0.576504647731781, "kl": 0.5235595703125, "learning_rate": 1.9778866060074014e-05, "loss": 0.0064, "reward": 2.0937501788139343, "reward_std": 0.16970369592308998, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933035969734192, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 538.7522659301758, "epoch": 0.16070495108655067, "grad_norm": 0.13818776607513428, "kl": 0.5501708984375, "learning_rate": 1.977667940155425e-05, "loss": -0.0363, "reward": 2.044642925262451, "reward_std": 0.17392255924642086, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9866071790456772, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 536.3303833007812, "epoch": 0.16100365917407214, "grad_norm": 0.1488024741411209, "kl": 0.1046142578125, "learning_rate": 1.9774482106965512e-05, "loss": 0.0011, "reward": 2.1467634439468384, "reward_std": 0.10904072038829327, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949777126312256, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 535.5580596923828, "epoch": 0.1613023672615936, "grad_norm": 0.22875918447971344, "kl": 0.140625, "learning_rate": 1.9772274178698245e-05, "loss": -0.0114, "reward": 2.088727831840515, "reward_std": 0.18664614856243134, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.992745578289032, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 491.8995819091797, "epoch": 0.16160107534911508, "grad_norm": 0.4599200487136841, "kl": 0.178466796875, "learning_rate": 1.9770055619154456e-05, "loss": -0.0164, "reward": 2.0228795409202576, "reward_std": 0.15588421002030373, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973618745804, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 549.1674346923828, "epoch": 0.16189978343663655, "grad_norm": 0.14768573641777039, "kl": 0.1202392578125, "learning_rate": 1.9767826430747724e-05, "loss": -0.0027, "reward": 2.095424175262451, "reward_std": 0.14111971575766802, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949777126312256, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 564.2857284545898, "epoch": 0.16219849152415802, "grad_norm": 10.593587875366211, "kl": 2.0029296875, "learning_rate": 1.9765586615903183e-05, "loss": 0.0887, "reward": 1.9698661863803864, "reward_std": 0.08819657657295465, "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.9575892984867096, "rewards/tag_count_reward": 0.9609375149011612, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 551.569221496582, "epoch": 0.1624971996116795, "grad_norm": 0.13179217278957367, "kl": 0.0738525390625, "learning_rate": 1.9763336177057536e-05, "loss": -0.0005, "reward": 2.142857253551483, "reward_std": 0.14807425905019045, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9910714477300644, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 555.491096496582, "epoch": 0.16279590769920096, "grad_norm": 0.25072595477104187, "kl": 0.0830078125, "learning_rate": 1.9761075116659037e-05, "loss": 0.0223, "reward": 2.0468750596046448, "reward_std": 0.09025902766734362, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9977679252624512, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 550.7299346923828, "epoch": 0.16309461578672244, "grad_norm": 0.17732328176498413, "kl": 0.110107421875, "learning_rate": 1.97588034371675e-05, "loss": 0.0071, "reward": 2.047433167695999, "reward_std": 0.13759984634816647, "rewards/accuracy_reward": 0.07142857694998384, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.991629496216774, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 554.310302734375, "epoch": 0.1633933238742439, "grad_norm": 0.11442292481660843, "kl": 0.10693359375, "learning_rate": 1.9756521141054286e-05, "loss": -0.0052, "reward": 2.039620667695999, "reward_std": 0.07856497168540955, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949776977300644, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 543.7745742797852, "epoch": 0.16369203196176538, "grad_norm": 0.7307174801826477, "kl": 0.173828125, "learning_rate": 1.9754228230802317e-05, "loss": -0.0004, "reward": 2.055245578289032, "reward_std": 0.11635634303092957, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 502.72547912597656, "epoch": 0.16399074004928682, "grad_norm": 0.13765080273151398, "kl": 0.083984375, "learning_rate": 1.9751924708906047e-05, "loss": 0.0117, "reward": 2.0736607909202576, "reward_std": 0.10940977744758129, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 521.1451187133789, "epoch": 0.1642894481368083, "grad_norm": 0.13661253452301025, "kl": 0.1103515625, "learning_rate": 1.9749610577871486e-05, "loss": -0.0139, "reward": 2.09319207072258, "reward_std": 0.09928274527192116, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 496.82144927978516, "epoch": 0.16458815622432976, "grad_norm": 0.6601089239120483, "kl": 0.2376708984375, "learning_rate": 1.974728584021618e-05, "loss": 0.0047, "reward": 2.1155134737491608, "reward_std": 0.1093750037252903, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812947034836, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 493.2232437133789, "epoch": 0.16488686431185123, "grad_norm": 0.17051751911640167, "kl": 0.097412109375, "learning_rate": 1.9744950498469218e-05, "loss": -0.0063, "reward": 2.1004464626312256, "reward_std": 0.10349253565073013, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9955357313156128, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 453.6674270629883, "epoch": 0.1651855723993727, "grad_norm": 0.17987282574176788, "kl": 0.13525390625, "learning_rate": 1.9742604555171222e-05, "loss": -0.005, "reward": 2.0731027722358704, "reward_std": 0.1226967042312026, "rewards/accuracy_reward": 0.09598215040750802, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.990513414144516, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 466.9955520629883, "epoch": 0.16548428048689418, "grad_norm": 356.9189758300781, "kl": 19.275390625, "learning_rate": 1.9740248012874344e-05, "loss": 1.0984, "reward": 2.0122768580913544, "reward_std": 0.22503631003201008, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9810268133878708, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 458.71207427978516, "epoch": 0.16578298857441565, "grad_norm": 15.410198211669922, "kl": 0.8046875, "learning_rate": 1.973788087414228e-05, "loss": 0.0043, "reward": 2.071428596973419, "reward_std": 0.144764244556427, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.9866071790456772, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 448.1875228881836, "epoch": 0.16608169666193712, "grad_norm": 0.8411842584609985, "kl": 0.341064453125, "learning_rate": 1.9735503141550233e-05, "loss": -0.0205, "reward": 1.9302456080913544, "reward_std": 0.4074000343680382, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9481027275323868, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 394.4955596923828, "epoch": 0.1663804047494586, "grad_norm": 22.906557083129883, "kl": 1.0712890625, "learning_rate": 1.9733114817684957e-05, "loss": -0.0108, "reward": 1.7739956080913544, "reward_std": 0.5364980325102806, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.8258928954601288, "rewards/tag_count_reward": 0.8922991454601288, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 324.5915298461914, "epoch": 0.16667911283698006, "grad_norm": 1.9101874828338623, "kl": 0.5224609375, "learning_rate": 1.9730715905144705e-05, "loss": -0.0525, "reward": 1.5853795111179352, "reward_std": 0.6816754341125488, "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.7008928954601288, "rewards/tag_count_reward": 0.8376116454601288, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 349.7768096923828, "epoch": 0.16697782092450153, "grad_norm": 1.5744434595108032, "kl": 0.22998046875, "learning_rate": 1.972830640653926e-05, "loss": -0.0697, "reward": 1.7734375894069672, "reward_std": 0.5070735216140747, "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.8415178954601288, "rewards/tag_count_reward": 0.909598246216774, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 373.76341247558594, "epoch": 0.167276529012023, "grad_norm": 22241976.0, "kl": 475136.1882324219, "learning_rate": 1.972588632448992e-05, "loss": 31581.2305, "reward": 1.9123884439468384, "reward_std": 0.3713812455534935, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.9129464626312256, "rewards/tag_count_reward": 0.9503348618745804, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 348.5535888671875, "epoch": 0.16757523709954447, "grad_norm": 2.857132911682129, "kl": 0.43310546875, "learning_rate": 1.97234556616295e-05, "loss": -0.0037, "reward": 1.9436385035514832, "reward_std": 0.4397526904940605, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.8839286118745804, "rewards/tag_count_reward": 0.936941996216774, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 366.27010345458984, "epoch": 0.16787394518706594, "grad_norm": 0.30276721715927124, "kl": 0.14453125, "learning_rate": 1.972101442060232e-05, "loss": 0.0059, "reward": 2.0000001192092896, "reward_std": 0.22271791473031044, "rewards/accuracy_reward": 0.058035716181620955, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.979910746216774, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 353.6584930419922, "epoch": 0.1681726532745874, "grad_norm": 1.049867868423462, "kl": 0.203369140625, "learning_rate": 1.9718562604064213e-05, "loss": 0.0039, "reward": 2.01506707072258, "reward_std": 0.19152727909386158, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9748884290456772, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 328.75447845458984, "epoch": 0.16847136136210888, "grad_norm": 0.3119976222515106, "kl": 0.13671875, "learning_rate": 1.9716100214682516e-05, "loss": -0.0023, "reward": 2.0089286267757416, "reward_std": 0.18834685161709785, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.98214291036129, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 318.28125762939453, "epoch": 0.16877006944963036, "grad_norm": 0.31292977929115295, "kl": 0.148193359375, "learning_rate": 1.9713627255136062e-05, "loss": 0.0013, "reward": 2.0669643580913544, "reward_std": 0.16226266883313656, "rewards/accuracy_reward": 0.09821429080329835, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9888393431901932, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 320.0826110839844, "epoch": 0.16906877753715183, "grad_norm": 38.92130661010742, "kl": 5.42822265625, "learning_rate": 1.9711143728115196e-05, "loss": 0.2846, "reward": 2.080915331840515, "reward_std": 0.18990927934646606, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9821429252624512, "rewards/tag_count_reward": 0.991629496216774, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 302.30135345458984, "epoch": 0.1693674856246733, "grad_norm": 17.36566162109375, "kl": 2.7255859375, "learning_rate": 1.9708649636321745e-05, "loss": 0.1654, "reward": 2.0117188096046448, "reward_std": 0.12648217007517815, "rewards/accuracy_reward": 0.029017857974395156, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616305589676, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 306.68750762939453, "epoch": 0.16966619371219477, "grad_norm": 0.808485209941864, "kl": 0.23193359375, "learning_rate": 1.970614498246904e-05, "loss": 0.0481, "reward": 2.092076003551483, "reward_std": 0.16425195895135403, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9871652275323868, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 286.72545623779297, "epoch": 0.16996490179971624, "grad_norm": 1.5331826210021973, "kl": 0.56982421875, "learning_rate": 1.97036297692819e-05, "loss": 0.0006, "reward": 2.0552456080913544, "reward_std": 0.20441073551774025, "rewards/accuracy_reward": 0.09151786169968545, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.98604916036129, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 258.74108123779297, "epoch": 0.1702636098872377, "grad_norm": 1.0098381042480469, "kl": 0.251953125, "learning_rate": 1.970110399949663e-05, "loss": 0.0313, "reward": 2.1037946939468384, "reward_std": 0.14976497553288937, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9921875298023224, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 252.92411422729492, "epoch": 0.17056231797475915, "grad_norm": 1.6497238874435425, "kl": 0.310302734375, "learning_rate": 1.9698567675861017e-05, "loss": 0.0148, "reward": 1.98381707072258, "reward_std": 0.1216916311532259, "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9905134290456772, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 255.92635345458984, "epoch": 0.17086102606228062, "grad_norm": 0.9293721318244934, "kl": 0.224609375, "learning_rate": 1.9696020801134333e-05, "loss": -0.0017, "reward": 2.0803572237491608, "reward_std": 0.22600505128502846, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.98214291036129, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 241.6718864440918, "epoch": 0.1711597341498021, "grad_norm": 0.8849908113479614, "kl": 0.258056640625, "learning_rate": 1.969346337808733e-05, "loss": -0.0029, "reward": 1.975446492433548, "reward_std": 0.3533182665705681, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.9218750298023224, "rewards/tag_count_reward": 0.9642857611179352, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 214.14733123779297, "epoch": 0.17145844223732357, "grad_norm": 0.865149199962616, "kl": 0.286865234375, "learning_rate": 1.9690895409502237e-05, "loss": 0.0153, "reward": 1.9341518580913544, "reward_std": 0.3090668097138405, "rewards/accuracy_reward": 0.0401785746216774, "rewards/format_reward": 0.9308036118745804, "rewards/tag_count_reward": 0.9631696790456772, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 169.14956283569336, "epoch": 0.17175715032484504, "grad_norm": 4.378220081329346, "kl": 0.4619140625, "learning_rate": 1.9688316898172744e-05, "loss": 0.0379, "reward": 1.8984375894069672, "reward_std": 0.26364049687981606, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.9285714775323868, "rewards/tag_count_reward": 0.965401828289032, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 139.92857933044434, "epoch": 0.1720558584123665, "grad_norm": 1.569952130317688, "kl": 0.556884765625, "learning_rate": 1.9685727846904026e-05, "loss": 0.0353, "reward": 1.9944197535514832, "reward_std": 0.30904511362314224, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.9441964775323868, "rewards/tag_count_reward": 0.97433041036129, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 118.90179252624512, "epoch": 0.17235456649988798, "grad_norm": 6.667843818664551, "kl": 2.61181640625, "learning_rate": 1.9683128258512712e-05, "loss": 0.0344, "reward": 1.9966518580913544, "reward_std": 0.276673287153244, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.9531250298023224, "rewards/tag_count_reward": 0.97433041036129, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 107.70759582519531, "epoch": 0.17265327458740945, "grad_norm": 1.515032410621643, "kl": 0.36962890625, "learning_rate": 1.96805181358269e-05, "loss": 0.0197, "reward": 2.0446429550647736, "reward_std": 0.15383674390614033, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9888393431901932, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 93.97545051574707, "epoch": 0.17295198267493092, "grad_norm": 8.787769317626953, "kl": 1.63623046875, "learning_rate": 1.967789748168615e-05, "loss": 0.0248, "reward": 2.0055804550647736, "reward_std": 0.17761345580220222, "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.987723246216774, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 93.0870590209961, "epoch": 0.1732506907624524, "grad_norm": 2.092597484588623, "kl": 1.345703125, "learning_rate": 1.967526629894148e-05, "loss": 0.0298, "reward": 1.9832590222358704, "reward_std": 0.2003653161227703, "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9832589626312256, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 90.82143592834473, "epoch": 0.17354939884997386, "grad_norm": 2.285836935043335, "kl": 0.96533203125, "learning_rate": 1.967262459045535e-05, "loss": -0.0003, "reward": 2.0379465222358704, "reward_std": 0.12293106690049171, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9910714626312256, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 91.49777030944824, "epoch": 0.17384810693749533, "grad_norm": 17.34015655517578, "kl": 2.54931640625, "learning_rate": 1.9669972359101685e-05, "loss": 0.0295, "reward": 2.0507813096046448, "reward_std": 0.12915104068815708, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9838170111179352, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 97.48661041259766, "epoch": 0.1741468150250168, "grad_norm": 0.8323637843132019, "kl": 0.306396484375, "learning_rate": 1.9667309607765857e-05, "loss": 0.0076, "reward": 2.1010045409202576, "reward_std": 0.07871063612401485, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937649011612, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 92.62500381469727, "epoch": 0.17444552311253828, "grad_norm": 0.351970911026001, "kl": 0.277099609375, "learning_rate": 1.9664636339344668e-05, "loss": 0.0009, "reward": 2.1205357909202576, "reward_std": 0.07513973396271467, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9955357313156128, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 123.55357551574707, "epoch": 0.17474423120005975, "grad_norm": 6.782862186431885, "kl": 0.658935546875, "learning_rate": 1.966195255674638e-05, "loss": 0.0437, "reward": 2.0217634439468384, "reward_std": 0.04873058386147022, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9972098469734192, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 153.58929061889648, "epoch": 0.17504293928758122, "grad_norm": 16.278785705566406, "kl": 1.673095703125, "learning_rate": 1.9659258262890683e-05, "loss": 0.0586, "reward": 1.9832590520381927, "reward_std": 0.08570339530706406, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9921875447034836, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 189.91295623779297, "epoch": 0.1753416473751027, "grad_norm": 0.39522528648376465, "kl": 0.244384765625, "learning_rate": 1.9656553460708707e-05, "loss": -0.0132, "reward": 1.9927456378936768, "reward_std": 0.09478804189711809, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949777126312256, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 202.18973922729492, "epoch": 0.17564035546262416, "grad_norm": 0.40142056345939636, "kl": 0.282470703125, "learning_rate": 1.9653838153143007e-05, "loss": -0.0231, "reward": 2.0301340520381927, "reward_std": 0.14526123367249966, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9854911267757416, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 232.42188262939453, "epoch": 0.17593906355014563, "grad_norm": 0.1668662577867508, "kl": 0.194580078125, "learning_rate": 1.9651112343147577e-05, "loss": 0.0072, "reward": 2.0664063692092896, "reward_std": 0.0620866846293211, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9994419813156128, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 266.0022392272949, "epoch": 0.1762377716376671, "grad_norm": 0.6572036147117615, "kl": 0.869873046875, "learning_rate": 1.964837603368783e-05, "loss": -0.0021, "reward": 2.026785731315613, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9955357313156128, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 308.59376525878906, "epoch": 0.17653647972518857, "grad_norm": 0.27891501784324646, "kl": 0.2060546875, "learning_rate": 1.9645629227740596e-05, "loss": -0.0014, "reward": 2.068638414144516, "reward_std": 0.09742716327309608, "rewards/accuracy_reward": 0.08258929220028222, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 366.9776916503906, "epoch": 0.17683518781271004, "grad_norm": 0.3378431797027588, "kl": 0.2001953125, "learning_rate": 1.9642871928294136e-05, "loss": -0.0044, "reward": 2.02287957072258, "reward_std": 0.06900635454803705, "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937798023224, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 417.41966247558594, "epoch": 0.17713389590023149, "grad_norm": 0.5598568320274353, "kl": 1.148193359375, "learning_rate": 1.9640104138348124e-05, "loss": -0.0261, "reward": 2.0206474363803864, "reward_std": 0.13585366681218147, "rewards/accuracy_reward": 0.05133928661234677, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.984933078289032, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 461.0848388671875, "epoch": 0.17743260398775296, "grad_norm": 0.44724971055984497, "kl": 0.374267578125, "learning_rate": 1.963732586091364e-05, "loss": -0.0184, "reward": 2.0987724363803864, "reward_std": 0.14821380004286766, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9871652126312256, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 476.47769927978516, "epoch": 0.17773131207527443, "grad_norm": 5.339955806732178, "kl": 4.17041015625, "learning_rate": 1.963453709901318e-05, "loss": 0.0045, "reward": 2.0312501192092896, "reward_std": 0.10714286006987095, "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714626312256, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 481.08484649658203, "epoch": 0.1780300201627959, "grad_norm": 7429.12646484375, "kl": 704.415283203125, "learning_rate": 1.963173785568064e-05, "loss": 38.9353, "reward": 2.032924175262451, "reward_std": 0.16151831485331059, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9793527275323868, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 548.7076110839844, "epoch": 0.17832872825031737, "grad_norm": 0.3786846101284027, "kl": 0.2353515625, "learning_rate": 1.9628928133961324e-05, "loss": 0.0145, "reward": 2.0664063692092896, "reward_std": 0.21454638242721558, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.964285746216774, "rewards/tag_count_reward": 0.9793527275323868, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 524.3326263427734, "epoch": 0.17862743633783884, "grad_norm": 0.6902797222137451, "kl": 0.174072265625, "learning_rate": 1.9626107936911936e-05, "loss": -0.0166, "reward": 2.04631707072258, "reward_std": 0.20464317873120308, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9815848618745804, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 479.7410888671875, "epoch": 0.1789261444253603, "grad_norm": 331.8794860839844, "kl": 34.00830078125, "learning_rate": 1.9623277267600574e-05, "loss": 2.3808, "reward": 1.9921875894069672, "reward_std": 0.3697265610098839, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.9308036267757416, "rewards/tag_count_reward": 0.949776828289032, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 522.2745819091797, "epoch": 0.17922485251288178, "grad_norm": 0.19888563454151154, "kl": 0.17626953125, "learning_rate": 1.9620436129106725e-05, "loss": -0.0296, "reward": 2.0111607909202576, "reward_std": 0.1799715030938387, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9754464626312256, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 505.1629638671875, "epoch": 0.17952356060040325, "grad_norm": 2.646221876144409, "kl": 0.223876953125, "learning_rate": 1.9617584524521273e-05, "loss": -0.0344, "reward": 1.9709822535514832, "reward_std": 0.299914438277483, "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9687500596046448, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 536.8526992797852, "epoch": 0.17982226868792472, "grad_norm": 7.383185386657715, "kl": 1.93115234375, "learning_rate": 1.9614722456946483e-05, "loss": 0.0272, "reward": 1.9860492050647736, "reward_std": 0.3698936477303505, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.9330357611179352, "rewards/tag_count_reward": 0.9458705931901932, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 465.0379638671875, "epoch": 0.1801209767754462, "grad_norm": 1.7442493438720703, "kl": 0.69482421875, "learning_rate": 1.9611849929496004e-05, "loss": -0.0265, "reward": 1.8984375894069672, "reward_std": 0.33602603524923325, "rewards/accuracy_reward": 0.013392857974395156, "rewards/format_reward": 0.9375000447034836, "rewards/tag_count_reward": 0.9475446790456772, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 490.2500228881836, "epoch": 0.18041968486296767, "grad_norm": 0.5587928891181946, "kl": 0.2197265625, "learning_rate": 1.9608966945294863e-05, "loss": -0.028, "reward": 2.028459906578064, "reward_std": 0.2728619650006294, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.9620535969734192, "rewards/tag_count_reward": 0.9704241454601288, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 502.0893020629883, "epoch": 0.18071839295048914, "grad_norm": 0.8254040479660034, "kl": 0.437255859375, "learning_rate": 1.9606073507479466e-05, "loss": -0.036, "reward": 1.9994420409202576, "reward_std": 0.33801497146487236, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9614955633878708, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 506.4844055175781, "epoch": 0.1810171010380106, "grad_norm": 0.7659482359886169, "kl": 0.767578125, "learning_rate": 1.960316961919759e-05, "loss": -0.0495, "reward": 1.9129465222358704, "reward_std": 0.26682592183351517, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9620536118745804, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 496.9933090209961, "epoch": 0.18131580912553208, "grad_norm": 0.6160946488380432, "kl": 0.395263671875, "learning_rate": 1.960025528360838e-05, "loss": -0.0494, "reward": 2.0133929550647736, "reward_std": 0.33333227038383484, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9441964626312256, "rewards/tag_count_reward": 0.9575893133878708, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 518.2477874755859, "epoch": 0.18161451721305355, "grad_norm": 0.3070965111255646, "kl": 1.1220703125, "learning_rate": 1.9597330503882345e-05, "loss": -0.101, "reward": 1.9525670409202576, "reward_std": 0.32578112930059433, "rewards/accuracy_reward": 0.042410717345774174, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9570312947034836, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 559.2723388671875, "epoch": 0.18191322530057502, "grad_norm": 0.23510758578777313, "kl": 0.50927734375, "learning_rate": 1.9594395283201362e-05, "loss": -0.0424, "reward": 1.9871653020381927, "reward_std": 0.1929151015356183, "rewards/accuracy_reward": 0.040178574388846755, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9760045111179352, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 507.45985412597656, "epoch": 0.1822119333880965, "grad_norm": 17.865297317504883, "kl": 14.41796875, "learning_rate": 1.959144962475867e-05, "loss": -0.0159, "reward": 1.924665242433548, "reward_std": 0.35971660912036896, "rewards/accuracy_reward": 0.024553573224693537, "rewards/format_reward": 0.9464285969734192, "rewards/tag_count_reward": 0.953683078289032, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 493.4620819091797, "epoch": 0.18251064147561796, "grad_norm": 0.5475573539733887, "kl": 0.568603515625, "learning_rate": 1.9588493531758843e-05, "loss": -0.0372, "reward": 2.1478795409202576, "reward_std": 0.23222080618143082, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9782366305589676, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 526.4151992797852, "epoch": 0.18280934956313943, "grad_norm": 0.30542418360710144, "kl": 0.9053955078125, "learning_rate": 1.9585527007417825e-05, "loss": -0.0496, "reward": 2.005580395460129, "reward_std": 0.2602269761264324, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.9732142984867096, "rewards/tag_count_reward": 0.9787946790456772, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 526.8549499511719, "epoch": 0.1831080576506609, "grad_norm": 0.5271150469779968, "kl": 1.42236328125, "learning_rate": 1.958255005496291e-05, "loss": -0.0716, "reward": 2.0574777722358704, "reward_std": 0.35162755101919174, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9592634439468384, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 546.4665374755859, "epoch": 0.18340676573818235, "grad_norm": 47.68092727661133, "kl": 141.75, "learning_rate": 1.9579562677632725e-05, "loss": 0.1722, "reward": 1.9804688692092896, "reward_std": 0.3114727884531021, "rewards/accuracy_reward": 0.06919643399305642, "rewards/format_reward": 0.9486607611179352, "rewards/tag_count_reward": 0.9626116454601288, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 566.8326110839844, "epoch": 0.18370547382570382, "grad_norm": 0.38345587253570557, "kl": 1.25830078125, "learning_rate": 1.957656487867724e-05, "loss": -0.046, "reward": 1.9754465222358704, "reward_std": 0.24533852562308311, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9732143431901932, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 588.2232513427734, "epoch": 0.1840041819132253, "grad_norm": 0.1931915432214737, "kl": 0.1634521484375, "learning_rate": 1.9573556661357777e-05, "loss": -0.0078, "reward": 2.000558167695999, "reward_std": 0.168121675029397, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.9665178805589676, "rewards/tag_count_reward": 0.9804687798023224, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 571.4442138671875, "epoch": 0.18430289000074676, "grad_norm": 0.18561600148677826, "kl": 0.3311767578125, "learning_rate": 1.9570538028946974e-05, "loss": -0.009, "reward": 2.02287957072258, "reward_std": 0.18997739255428314, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9737723618745804, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 563.5424346923828, "epoch": 0.18460159808826823, "grad_norm": 0.1945761889219284, "kl": 0.2628173828125, "learning_rate": 1.956750898472881e-05, "loss": -0.0284, "reward": 2.1328125596046448, "reward_std": 0.21101950109004974, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.978794664144516, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 554.8393173217773, "epoch": 0.1849003061757897, "grad_norm": 0.19168204069137573, "kl": 0.1702880859375, "learning_rate": 1.9564469531998586e-05, "loss": 0.0128, "reward": 2.083147406578064, "reward_std": 0.1873304881155491, "rewards/accuracy_reward": 0.10937500279396772, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973618745804, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 527.5960083007812, "epoch": 0.18519901426331117, "grad_norm": 0.28238868713378906, "kl": 0.1439208984375, "learning_rate": 1.9561419674062928e-05, "loss": 0.0206, "reward": 2.1289063692092896, "reward_std": 0.0992774460464716, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 551.037971496582, "epoch": 0.18549772235083264, "grad_norm": 0.2665957510471344, "kl": 0.215576171875, "learning_rate": 1.9558359414239786e-05, "loss": -0.0082, "reward": 2.0306920409202576, "reward_std": 0.18142830207943916, "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9882812947034836, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 498.1250228881836, "epoch": 0.18579643043835412, "grad_norm": 0.23014047741889954, "kl": 0.1123046875, "learning_rate": 1.9555288755858425e-05, "loss": 0.0131, "reward": 2.1640625596046448, "reward_std": 0.23354042321443558, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9921875447034836, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 480.8460006713867, "epoch": 0.1860951385258756, "grad_norm": 0.19973130524158478, "kl": 0.238037109375, "learning_rate": 1.9552207702259412e-05, "loss": -0.016, "reward": 2.1054688692092896, "reward_std": 0.17778103798627853, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9893973618745804, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 504.8370819091797, "epoch": 0.18639384661339706, "grad_norm": 0.2913791239261627, "kl": 0.38623046875, "learning_rate": 1.9549116256794636e-05, "loss": -0.0158, "reward": 2.15178582072258, "reward_std": 0.23097272217273712, "rewards/accuracy_reward": 0.18080357694998384, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9866071790456772, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 545.5759124755859, "epoch": 0.18669255470091853, "grad_norm": 0.20784670114517212, "kl": 0.1195068359375, "learning_rate": 1.9546014422827287e-05, "loss": 0.0016, "reward": 2.0753349363803864, "reward_std": 0.17938613891601562, "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.992745578289032, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 531.7232360839844, "epoch": 0.18699126278844, "grad_norm": 0.2661862373352051, "kl": 0.2977294921875, "learning_rate": 1.954290220373186e-05, "loss": 0.007, "reward": 2.0837054550647736, "reward_std": 0.17835931293666363, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.987723246216774, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 501.5000228881836, "epoch": 0.18728997087596147, "grad_norm": 0.14878065884113312, "kl": 0.156005859375, "learning_rate": 1.9539779602894136e-05, "loss": 0.0149, "reward": 2.075892984867096, "reward_std": 0.14250191673636436, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9933035969734192, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 537.3616333007812, "epoch": 0.18758867896348294, "grad_norm": 0.1706964522600174, "kl": 0.1995849609375, "learning_rate": 1.9536646623711204e-05, "loss": 0.0099, "reward": 1.9927456378936768, "reward_std": 0.1384572871029377, "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.990513414144516, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 525.3102798461914, "epoch": 0.1878873870510044, "grad_norm": 0.14688649773597717, "kl": 0.236572265625, "learning_rate": 1.9533503269591438e-05, "loss": -0.0143, "reward": 2.1210938692092896, "reward_std": 0.14372059144079685, "rewards/accuracy_reward": 0.14285714668221772, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9916295111179352, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 503.58929443359375, "epoch": 0.18818609513852588, "grad_norm": 0.20485076308250427, "kl": 0.209716796875, "learning_rate": 1.9530349543954495e-05, "loss": -0.0226, "reward": 2.0396206080913544, "reward_std": 0.2057734802365303, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9815848618745804, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 469.70760345458984, "epoch": 0.18848480322604735, "grad_norm": 4.916904926300049, "kl": 0.6185302734375, "learning_rate": 1.9527185450231328e-05, "loss": -0.0168, "reward": 2.065290331840515, "reward_std": 0.1997381765395403, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.984933078289032, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 449.8013610839844, "epoch": 0.18878351131356882, "grad_norm": 0.6611330509185791, "kl": 0.231201171875, "learning_rate": 1.9524010991864152e-05, "loss": 0.0008, "reward": 2.111049234867096, "reward_std": 0.16796598583459854, "rewards/accuracy_reward": 0.12276786239817739, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 472.7455596923828, "epoch": 0.1890822194010903, "grad_norm": 0.14818908274173737, "kl": 0.386962890625, "learning_rate": 1.952082617230647e-05, "loss": -0.0057, "reward": 1.997767984867096, "reward_std": 0.10368260834366083, "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 472.77903747558594, "epoch": 0.18938092748861177, "grad_norm": 0.495087206363678, "kl": 0.384033203125, "learning_rate": 1.9517630995023057e-05, "loss": -0.0421, "reward": 2.0574777722358704, "reward_std": 0.16824615746736526, "rewards/accuracy_reward": 0.1071428582072258, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9771205633878708, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 481.5826110839844, "epoch": 0.18967963557613324, "grad_norm": 0.10086461156606674, "kl": 0.1312255859375, "learning_rate": 1.9514425463489946e-05, "loss": -0.0037, "reward": 1.9921875596046448, "reward_std": 0.058363729156553745, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9966518133878708, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 519.5714492797852, "epoch": 0.18997834366365468, "grad_norm": 0.12320530414581299, "kl": 0.10693359375, "learning_rate": 1.9511209581194447e-05, "loss": 0.0009, "reward": 2.0463171005249023, "reward_std": 0.09007361624389887, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 548.450927734375, "epoch": 0.19027705175117615, "grad_norm": 0.12605080008506775, "kl": 0.2371826171875, "learning_rate": 1.9507983351635124e-05, "loss": -0.0006, "reward": 2.060267925262451, "reward_std": 0.1315991673618555, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9933036118745804, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 555.0201110839844, "epoch": 0.19057575983869762, "grad_norm": 0.12246573716402054, "kl": 0.155029296875, "learning_rate": 1.9504746778321793e-05, "loss": -0.008, "reward": 2.0591518878936768, "reward_std": 0.05564750451594591, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.994419664144516, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 544.991096496582, "epoch": 0.1908744679262191, "grad_norm": 0.10026463121175766, "kl": 0.1458740234375, "learning_rate": 1.9501499864775536e-05, "loss": -0.0008, "reward": 2.0803572237491608, "reward_std": 0.08217759989202023, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9910714626312256, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 513.3415451049805, "epoch": 0.19117317601374056, "grad_norm": 0.21127918362617493, "kl": 0.42724609375, "learning_rate": 1.9498242614528672e-05, "loss": -0.0249, "reward": 2.075892925262451, "reward_std": 0.1629748735576868, "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9843750298023224, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 579.3817291259766, "epoch": 0.19147188410126204, "grad_norm": 0.0812065452337265, "kl": 0.0986328125, "learning_rate": 1.9494975031124768e-05, "loss": 0.0063, "reward": 2.0691965222358704, "reward_std": 0.05616512894630432, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 581.5312652587891, "epoch": 0.1917705921887835, "grad_norm": 0.1396270990371704, "kl": 0.79296875, "learning_rate": 1.9491697118118643e-05, "loss": -0.0178, "reward": 2.0340402722358704, "reward_std": 0.13228207174688578, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.991629496216774, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 543.3147506713867, "epoch": 0.19206930027630498, "grad_norm": 0.08768697828054428, "kl": 0.25048828125, "learning_rate": 1.9488408879076336e-05, "loss": 0.0073, "reward": 2.0948661267757416, "reward_std": 0.049107144586741924, "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9966518133878708, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 542.428581237793, "epoch": 0.19236800836382645, "grad_norm": 0.1121445745229721, "kl": 0.0972900390625, "learning_rate": 1.9485110317575134e-05, "loss": 0.0129, "reward": 2.0491071939468384, "reward_std": 0.098214291036129, "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9933036267757416, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 563.7969055175781, "epoch": 0.19266671645134792, "grad_norm": 0.09002290666103363, "kl": 0.2930908203125, "learning_rate": 1.9481801437203547e-05, "loss": -0.0046, "reward": 2.0669643878936768, "reward_std": 0.0655801072716713, "rewards/accuracy_reward": 0.07366072107106447, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678656578064, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 631.0870819091797, "epoch": 0.1929654245388694, "grad_norm": 0.12680764496326447, "kl": 0.091064453125, "learning_rate": 1.9478482241561312e-05, "loss": 0.0199, "reward": 2.0831474661827087, "reward_std": 0.12168790958821774, "rewards/accuracy_reward": 0.10044643189758062, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9960937798023224, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 560.8326263427734, "epoch": 0.19326413262639086, "grad_norm": 0.2790752649307251, "kl": 0.7237548828125, "learning_rate": 1.947515273425939e-05, "loss": -0.018, "reward": 2.05412957072258, "reward_std": 0.15422965213656425, "rewards/accuracy_reward": 0.07589286309666932, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.991629496216774, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 556.6786041259766, "epoch": 0.19356284071391233, "grad_norm": 3.9884138107299805, "kl": 1.3055419921875, "learning_rate": 1.9471812918919958e-05, "loss": 0.0409, "reward": 2.115513563156128, "reward_std": 0.16786598414182663, "rewards/accuracy_reward": 0.13839286053553224, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9927455633878708, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 582.8861846923828, "epoch": 0.1938615488014338, "grad_norm": 0.1168767586350441, "kl": 0.0845947265625, "learning_rate": 1.9468462799176407e-05, "loss": 0.0042, "reward": 2.1238840222358704, "reward_std": 0.10861279629170895, "rewards/accuracy_reward": 0.13169643748551607, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 576.388427734375, "epoch": 0.19416025688895527, "grad_norm": 0.1217607781291008, "kl": 0.25244140625, "learning_rate": 1.946510237867334e-05, "loss": -0.0213, "reward": 2.0970982909202576, "reward_std": 0.14817756600677967, "rewards/accuracy_reward": 0.11607143236324191, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9944196790456772, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 591.2165374755859, "epoch": 0.19445896497647674, "grad_norm": 0.15098896622657776, "kl": 0.1259765625, "learning_rate": 1.9461731661066564e-05, "loss": 0.0176, "reward": 2.1021206974983215, "reward_std": 0.14238932728767395, "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949777275323868, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 593.1428833007812, "epoch": 0.19475767306399822, "grad_norm": 0.16389621794223785, "kl": 0.1339111328125, "learning_rate": 1.9458350650023092e-05, "loss": 0.0027, "reward": 2.0859376192092896, "reward_std": 0.1726325824856758, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9921875447034836, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 587.9062805175781, "epoch": 0.1950563811515197, "grad_norm": 0.5271249413490295, "kl": 0.25830078125, "learning_rate": 1.945495934922113e-05, "loss": 0.0076, "reward": 2.031808167695999, "reward_std": 0.15497082471847534, "rewards/accuracy_reward": 0.06919643259607255, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9871652126312256, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 552.8460083007812, "epoch": 0.19535508923904116, "grad_norm": 0.2027045339345932, "kl": 0.316162109375, "learning_rate": 1.945155776235008e-05, "loss": -0.0155, "reward": 2.0044643878936768, "reward_std": 0.15188300423324108, "rewards/accuracy_reward": 0.029017857741564512, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9910714626312256, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 618.5558166503906, "epoch": 0.19565379732656263, "grad_norm": 0.16521187126636505, "kl": 0.4559326171875, "learning_rate": 1.944814589311054e-05, "loss": -0.0021, "reward": 2.0189732909202576, "reward_std": 0.11499083507806063, "rewards/accuracy_reward": 0.03348214388824999, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.994419664144516, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 567.2388610839844, "epoch": 0.1959525054140841, "grad_norm": 1.138617992401123, "kl": 1.6063232421875, "learning_rate": 1.9444723745214285e-05, "loss": -0.0164, "reward": 2.085937589406967, "reward_std": 0.21431630477309227, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9854911118745804, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 551.2053833007812, "epoch": 0.19625121350160554, "grad_norm": 0.4997687339782715, "kl": 0.267333984375, "learning_rate": 1.9441291322384275e-05, "loss": -0.012, "reward": 2.0920759439468384, "reward_std": 0.19767209887504578, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9938616305589676, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 626.1049346923828, "epoch": 0.196549921589127, "grad_norm": 0.18581970036029816, "kl": 0.26171875, "learning_rate": 1.9437848628354655e-05, "loss": -0.0386, "reward": 2.058593839406967, "reward_std": 0.14033542573451996, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9871652126312256, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 573.7411041259766, "epoch": 0.19684862967664848, "grad_norm": 0.229697123169899, "kl": 0.1983642578125, "learning_rate": 1.9434395666870735e-05, "loss": -0.0531, "reward": 2.0362724363803864, "reward_std": 0.16463925503194332, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9871652126312256, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 582.9553680419922, "epoch": 0.19714733776416996, "grad_norm": 0.8445436358451843, "kl": 0.509765625, "learning_rate": 1.9430932441688998e-05, "loss": -0.1598, "reward": 1.88616082072258, "reward_std": 0.4923956170678139, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.90401791036129, "rewards/tag_count_reward": 0.9330357611179352, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 624.3616485595703, "epoch": 0.19744604585169143, "grad_norm": 0.23734590411186218, "kl": 0.3291015625, "learning_rate": 1.9427458956577098e-05, "loss": -0.0711, "reward": 2.2343751192092896, "reward_std": 0.2810874283313751, "rewards/accuracy_reward": 0.2991071566939354, "rewards/format_reward": 0.9620535969734192, "rewards/tag_count_reward": 0.9732143431901932, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 728.2812957763672, "epoch": 0.1977447539392129, "grad_norm": 0.41915419697761536, "kl": 0.738525390625, "learning_rate": 1.942397521531384e-05, "loss": -0.0468, "reward": 1.8694197237491608, "reward_std": 0.3603213392198086, "rewards/accuracy_reward": 0.017857143888249993, "rewards/format_reward": 0.9040178805589676, "rewards/tag_count_reward": 0.9475446790456772, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 754.5424499511719, "epoch": 0.19804346202673437, "grad_norm": 0.8589540719985962, "kl": 1.90576171875, "learning_rate": 1.9420481221689203e-05, "loss": 0.0175, "reward": 1.9665179550647736, "reward_std": 0.37868812680244446, "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.9084821790456772, "rewards/tag_count_reward": 0.9531250298023224, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 688.6451110839844, "epoch": 0.19834217011425584, "grad_norm": 0.2734261751174927, "kl": 0.4468994140625, "learning_rate": 1.9416976979504297e-05, "loss": 0.0142, "reward": 1.9804688394069672, "reward_std": 0.19758136197924614, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.9531250596046448, "rewards/tag_count_reward": 0.9782366454601288, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 685.1027069091797, "epoch": 0.1986408782017773, "grad_norm": 58.677894592285156, "kl": 7.02685546875, "learning_rate": 1.9413462492571403e-05, "loss": 0.3456, "reward": 1.9827010035514832, "reward_std": 0.1877211220562458, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.97823666036129, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 637.513427734375, "epoch": 0.19893958628929878, "grad_norm": 0.18045955896377563, "kl": 0.1737060546875, "learning_rate": 1.940993776471393e-05, "loss": -0.0036, "reward": 2.098214328289032, "reward_std": 0.16076923348009586, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9910714775323868, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 647.2924499511719, "epoch": 0.19923829437682025, "grad_norm": 0.5303198099136353, "kl": 0.2049560546875, "learning_rate": 1.9406402799766452e-05, "loss": 0.0147, "reward": 1.9782367050647736, "reward_std": 0.13694825861603022, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9893973618745804, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 551.7031402587891, "epoch": 0.19953700246434172, "grad_norm": 0.6709874868392944, "kl": 0.403564453125, "learning_rate": 1.940285760157465e-05, "loss": 0.0241, "reward": 2.1244421303272247, "reward_std": 0.18252059444785118, "rewards/accuracy_reward": 0.16071428963914514, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9860491454601288, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 518.5446701049805, "epoch": 0.1998357105518632, "grad_norm": 1.454507827758789, "kl": 1.91650390625, "learning_rate": 1.9399302173995354e-05, "loss": 0.0205, "reward": 2.2053571939468384, "reward_std": 0.28494907543063164, "rewards/accuracy_reward": 0.2678571566939354, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9754464626312256, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 546.7366333007812, "epoch": 0.20013441863938466, "grad_norm": 0.7227110862731934, "kl": 1.278076171875, "learning_rate": 1.9395736520896528e-05, "loss": 0.0882, "reward": 2.0044643580913544, "reward_std": 0.31320128217339516, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.9687500447034836, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 569.0402069091797, "epoch": 0.20043312672690614, "grad_norm": 0.8570789098739624, "kl": 0.54248046875, "learning_rate": 1.9392160646157242e-05, "loss": 0.0403, "reward": 2.0535715222358704, "reward_std": 0.14231387339532375, "rewards/accuracy_reward": 0.08258928707800806, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.988839328289032, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 585.0803680419922, "epoch": 0.2007318348144276, "grad_norm": 0.12841270864009857, "kl": 0.1949462890625, "learning_rate": 1.938857455366771e-05, "loss": 0.0031, "reward": 2.138392984867096, "reward_std": 0.09111165255308151, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 624.2611694335938, "epoch": 0.20103054290194908, "grad_norm": 0.28217801451683044, "kl": 0.112060546875, "learning_rate": 1.9384978247329238e-05, "loss": 0.0079, "reward": 2.154017984867096, "reward_std": 0.14850426837801933, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9955357313156128, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 669.8928985595703, "epoch": 0.20132925098947055, "grad_norm": 0.13751497864723206, "kl": 0.0911865234375, "learning_rate": 1.9381371731054263e-05, "loss": 0.0092, "reward": 2.0245536863803864, "reward_std": 0.11854853946715593, "rewards/accuracy_reward": 0.03571428800933063, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9977678656578064, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 630.7723388671875, "epoch": 0.20162795907699202, "grad_norm": 0.15336832404136658, "kl": 0.0906982421875, "learning_rate": 1.9377755008766316e-05, "loss": 0.0235, "reward": 2.0518974661827087, "reward_std": 0.08325076568871737, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9960937798023224, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 685.700927734375, "epoch": 0.2019266671645135, "grad_norm": 0.16728748381137848, "kl": 0.08837890625, "learning_rate": 1.9374128084400038e-05, "loss": 0.0209, "reward": 2.0814733505249023, "reward_std": 0.16167160123586655, "rewards/accuracy_reward": 0.10491071920841932, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9944196790456772, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 680.8348541259766, "epoch": 0.20222537525203496, "grad_norm": 0.15299268066883087, "kl": 0.0897216796875, "learning_rate": 1.937049096190117e-05, "loss": 0.0311, "reward": 2.070312649011612, "reward_std": 0.19397217221558094, "rewards/accuracy_reward": 0.13392857206054032, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9832589626312256, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 625.1763610839844, "epoch": 0.20252408333955643, "grad_norm": 0.9176636338233948, "kl": 2.6920166015625, "learning_rate": 1.936684364522654e-05, "loss": -0.0016, "reward": 2.1177456378936768, "reward_std": 0.08239921554923058, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 618.5535888671875, "epoch": 0.20282279142707788, "grad_norm": 0.14589020609855652, "kl": 0.0928955078125, "learning_rate": 1.9363186138344075e-05, "loss": 0.016, "reward": 2.087611675262451, "reward_std": 0.14511973783373833, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9938616454601288, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 669.5558471679688, "epoch": 0.20312149951459935, "grad_norm": 0.14738580584526062, "kl": 0.099853515625, "learning_rate": 1.9359518445232778e-05, "loss": 0.0239, "reward": 2.102678656578064, "reward_std": 0.14304404519498348, "rewards/accuracy_reward": 0.12946429336443543, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9933035969734192, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 609.3839569091797, "epoch": 0.20342020760212082, "grad_norm": 0.14502006769180298, "kl": 0.0941162109375, "learning_rate": 1.935584056988275e-05, "loss": 0.0154, "reward": 2.0825894474983215, "reward_std": 0.12997434847056866, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.995535746216774, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 605.0335083007812, "epoch": 0.2037189156896423, "grad_norm": 0.1022934690117836, "kl": 0.0946044921875, "learning_rate": 1.935215251629515e-05, "loss": 0.0112, "reward": 2.0597099661827087, "reward_std": 0.08328117989003658, "rewards/accuracy_reward": 0.06250000139698386, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 572.9375305175781, "epoch": 0.20401762377716376, "grad_norm": 0.1212196946144104, "kl": 0.1015625, "learning_rate": 1.934845428848222e-05, "loss": 0.0147, "reward": 2.1205358505249023, "reward_std": 0.10844871029257774, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9977678805589676, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 590.8437652587891, "epoch": 0.20431633186468523, "grad_norm": 0.12689407169818878, "kl": 0.0999755859375, "learning_rate": 1.9344745890467273e-05, "loss": 0.0087, "reward": 2.0976563692092896, "reward_std": 0.0915178619325161, "rewards/accuracy_reward": 0.10044643213041127, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 536.8281402587891, "epoch": 0.2046150399522067, "grad_norm": 0.1197967529296875, "kl": 0.0966796875, "learning_rate": 1.934102732628468e-05, "loss": 0.0122, "reward": 2.0558037161827087, "reward_std": 0.08645018469542265, "rewards/accuracy_reward": 0.05580357578583062, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 550.3058395385742, "epoch": 0.20491374803972817, "grad_norm": 0.11527325958013535, "kl": 0.09765625, "learning_rate": 1.9337298599979877e-05, "loss": 0.0043, "reward": 2.071428656578064, "reward_std": 0.07494965940713882, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 568.9553833007812, "epoch": 0.20521245612724964, "grad_norm": 0.11371306329965591, "kl": 0.09619140625, "learning_rate": 1.933355971560935e-05, "loss": -0.0016, "reward": 2.022321581840515, "reward_std": 0.06388495303690434, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 530.756721496582, "epoch": 0.2055111642147711, "grad_norm": 0.12117329239845276, "kl": 0.1063232421875, "learning_rate": 1.9329810677240643e-05, "loss": 0.0009, "reward": 2.137834906578064, "reward_std": 0.07577520050108433, "rewards/accuracy_reward": 0.1406250111758709, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 561.3058166503906, "epoch": 0.20580987230229258, "grad_norm": 0.13554944097995758, "kl": 0.1009521484375, "learning_rate": 1.9326051488952334e-05, "loss": 0.0095, "reward": 2.165178656578064, "reward_std": 0.11694705951958895, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 548.3683319091797, "epoch": 0.20610858038981406, "grad_norm": 0.12340112775564194, "kl": 0.0985107421875, "learning_rate": 1.9322282154834055e-05, "loss": 0.0072, "reward": 2.107142984867096, "reward_std": 0.0929968785494566, "rewards/accuracy_reward": 0.10714285937137902, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 577.4665374755859, "epoch": 0.20640728847733553, "grad_norm": 0.07537932693958282, "kl": 0.103759765625, "learning_rate": 1.9318502678986476e-05, "loss": 0.009, "reward": 2.205357253551483, "reward_std": 0.05168620124459267, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 564.7187652587891, "epoch": 0.206705996564857, "grad_norm": 0.11587102711200714, "kl": 0.0955810546875, "learning_rate": 1.9314713065521294e-05, "loss": 0.0102, "reward": 2.0602678656578064, "reward_std": 0.05935155972838402, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 586.7366333007812, "epoch": 0.20700470465237847, "grad_norm": 0.1080361157655716, "kl": 0.1019287109375, "learning_rate": 1.9310913318561235e-05, "loss": 0.0144, "reward": 2.048549234867096, "reward_std": 0.05391834583133459, "rewards/accuracy_reward": 0.05133928684517741, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 605.2053833007812, "epoch": 0.20730341273989994, "grad_norm": 0.09202894568443298, "kl": 0.0977783203125, "learning_rate": 1.9307103442240054e-05, "loss": 0.0095, "reward": 2.1311384439468384, "reward_std": 0.07837759330868721, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 568.3214569091797, "epoch": 0.2076021208274214, "grad_norm": 0.17058627307415009, "kl": 0.0986328125, "learning_rate": 1.9303283440702524e-05, "loss": 0.0021, "reward": 2.146205484867096, "reward_std": 0.14518412575125694, "rewards/accuracy_reward": 0.15848215483129025, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.996651828289032, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 597.7567291259766, "epoch": 0.20790082891494288, "grad_norm": 0.1451936960220337, "kl": 0.103759765625, "learning_rate": 1.9299453318104428e-05, "loss": 0.0174, "reward": 2.0100447833538055, "reward_std": 0.07243260648101568, "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9988839328289032, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 638.4754791259766, "epoch": 0.20819953700246435, "grad_norm": 0.10499212145805359, "kl": 0.09912109375, "learning_rate": 1.9295613078612566e-05, "loss": 0.0076, "reward": 2.090959906578064, "reward_std": 0.05907375179231167, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 624.7254638671875, "epoch": 0.20849824508998582, "grad_norm": 0.13077150285243988, "kl": 0.1011962890625, "learning_rate": 1.9291762726404742e-05, "loss": 0.0091, "reward": 2.0870536863803864, "reward_std": 0.08406830485910177, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678805589676, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 625.8058471679688, "epoch": 0.2087969531775073, "grad_norm": 0.12930545210838318, "kl": 0.1175537109375, "learning_rate": 1.9287902265669764e-05, "loss": 0.0192, "reward": 2.0970982909202576, "reward_std": 0.15691225230693817, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9966518133878708, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 645.2053833007812, "epoch": 0.20909566126502874, "grad_norm": 0.09799229353666306, "kl": 0.1026611328125, "learning_rate": 1.9284031700607434e-05, "loss": 0.0182, "reward": 2.073102742433548, "reward_std": 0.0825701653957367, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.997209832072258, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 619.2857360839844, "epoch": 0.2093943693525502, "grad_norm": 0.10841073840856552, "kl": 0.1060791015625, "learning_rate": 1.9280151035428544e-05, "loss": 0.0084, "reward": 2.135044753551483, "reward_std": 0.06937365606427193, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9988839626312256, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 623.8415374755859, "epoch": 0.20969307744007168, "grad_norm": 0.09272396564483643, "kl": 0.1065673828125, "learning_rate": 1.9276260274354884e-05, "loss": 0.0121, "reward": 2.0200893878936768, "reward_std": 0.05242492165416479, "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9977678805589676, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 612.2165374755859, "epoch": 0.20999178552759315, "grad_norm": 0.13471810519695282, "kl": 0.101318359375, "learning_rate": 1.927235942161921e-05, "loss": 0.0145, "reward": 2.0552456080913544, "reward_std": 0.09991380199790001, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.997209832072258, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 599.2210235595703, "epoch": 0.21029049361511462, "grad_norm": 0.19309881329536438, "kl": 0.12060546875, "learning_rate": 1.9268448481465282e-05, "loss": 0.0039, "reward": 2.1132813692092896, "reward_std": 0.12435752619057894, "rewards/accuracy_reward": 0.11607143026776612, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 597.2433319091797, "epoch": 0.2105892017026361, "grad_norm": 0.11649996787309647, "kl": 0.1024169921875, "learning_rate": 1.9264527458147807e-05, "loss": 0.0105, "reward": 2.023437589406967, "reward_std": 0.08463135547935963, "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9988839328289032, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 585.8348388671875, "epoch": 0.21088790979015756, "grad_norm": 0.13827019929885864, "kl": 0.1011962890625, "learning_rate": 1.926059635593248e-05, "loss": 0.0061, "reward": 2.0887277722358704, "reward_std": 0.0761706680059433, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 581.8236694335938, "epoch": 0.21118661787767903, "grad_norm": 0.08993768692016602, "kl": 0.103271484375, "learning_rate": 1.9256655179095954e-05, "loss": 0.0031, "reward": 2.033482253551483, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.03571428847499192, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 1.0, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 578.1339416503906, "epoch": 0.2114853259652005, "grad_norm": 0.18841010332107544, "kl": 0.0987548828125, "learning_rate": 1.9252703931925843e-05, "loss": 0.0197, "reward": 2.0396206080913544, "reward_std": 0.13379283249378204, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9949776977300644, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 559.0826110839844, "epoch": 0.21178403405272198, "grad_norm": 0.19534654915332794, "kl": 0.101318359375, "learning_rate": 1.9248742618720714e-05, "loss": 0.0116, "reward": 2.1143974661827087, "reward_std": 0.12324204295873642, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 584.8214721679688, "epoch": 0.21208274214024345, "grad_norm": 0.11433392763137817, "kl": 0.100341796875, "learning_rate": 1.9244771243790092e-05, "loss": 0.012, "reward": 2.126674234867096, "reward_std": 0.0739116258919239, "rewards/accuracy_reward": 0.12946429592557251, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 570.9375305175781, "epoch": 0.21238145022776492, "grad_norm": 0.12610457837581635, "kl": 0.093994140625, "learning_rate": 1.9240789811454443e-05, "loss": 0.0146, "reward": 2.041294813156128, "reward_std": 0.066774214617908, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9988839328289032, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 509.24779510498047, "epoch": 0.2126801583152864, "grad_norm": 0.13902248442173004, "kl": 0.100830078125, "learning_rate": 1.9236798326045173e-05, "loss": 0.0036, "reward": 2.107142925262451, "reward_std": 0.09814595058560371, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 1.0, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 527.3750228881836, "epoch": 0.21297886640280786, "grad_norm": 0.27284786105155945, "kl": 0.12939453125, "learning_rate": 1.9232796791904627e-05, "loss": -0.0018, "reward": 2.1417412161827087, "reward_std": 0.09437472652643919, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9988839626312256, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 547.7656555175781, "epoch": 0.21327757449032933, "grad_norm": 0.13124559819698334, "kl": 0.105224609375, "learning_rate": 1.9228785213386082e-05, "loss": 0.0085, "reward": 2.064732313156128, "reward_std": 0.10502103343605995, "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 534.2120819091797, "epoch": 0.2135762825778508, "grad_norm": 0.10546193271875381, "kl": 0.101318359375, "learning_rate": 1.9224763594853747e-05, "loss": -0.0, "reward": 2.017857253551483, "reward_std": 0.0776920048519969, "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9977678805589676, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 521.3325958251953, "epoch": 0.21387499066537227, "grad_norm": 42.67045593261719, "kl": 4.138427734375, "learning_rate": 1.9220731940682738e-05, "loss": 0.2302, "reward": 2.105468839406967, "reward_std": 0.059948612935841084, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9983258992433548, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 508.3058395385742, "epoch": 0.21417369875289374, "grad_norm": 0.11214511096477509, "kl": 0.0989990234375, "learning_rate": 1.9216690255259113e-05, "loss": 0.0045, "reward": 2.060267984867096, "reward_std": 0.07735436595976353, "rewards/accuracy_reward": 0.06473214458674192, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 1.0, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 486.5803756713867, "epoch": 0.21447240684041521, "grad_norm": 0.18160708248615265, "kl": 0.1055908203125, "learning_rate": 1.921263854297982e-05, "loss": 0.0083, "reward": 2.0619420409202576, "reward_std": 0.10289645474404097, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9994419813156128, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 463.61609649658203, "epoch": 0.21477111492793668, "grad_norm": 0.16923706233501434, "kl": 0.1064453125, "learning_rate": 1.9208576808252725e-05, "loss": 0.0084, "reward": 2.0825893878936768, "reward_std": 0.05495638120919466, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 1.0, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 494.5156478881836, "epoch": 0.21506982301545816, "grad_norm": 0.1537003517150879, "kl": 0.109619140625, "learning_rate": 1.9204505055496605e-05, "loss": -0.0007, "reward": 2.0457590222358704, "reward_std": 0.12056603748351336, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9988839328289032, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 507.2366256713867, "epoch": 0.21536853110297963, "grad_norm": 0.13674713671207428, "kl": 0.112060546875, "learning_rate": 1.920042328914112e-05, "loss": -0.002, "reward": 2.1004465222358704, "reward_std": 0.07840991485863924, "rewards/accuracy_reward": 0.10267857392318547, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 1.0, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 521.3348388671875, "epoch": 0.21566723919050107, "grad_norm": 0.6250355243682861, "kl": 0.1964111328125, "learning_rate": 1.9196331513626836e-05, "loss": -0.004, "reward": 2.0362724661827087, "reward_std": 0.12367313914000988, "rewards/accuracy_reward": 0.04464285867288709, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.9983258992433548, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 524.1049346923828, "epoch": 0.21596594727802254, "grad_norm": 0.2973407208919525, "kl": 0.10546875, "learning_rate": 1.9192229733405204e-05, "loss": 0.0035, "reward": 2.1372768878936768, "reward_std": 0.056736123049631715, "rewards/accuracy_reward": 0.14062500861473382, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9988839626312256, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 524.6183319091797, "epoch": 0.216264655365544, "grad_norm": 0.12272752076387405, "kl": 0.1060791015625, "learning_rate": 1.9188117952938557e-05, "loss": 0.0156, "reward": 2.0998884439468384, "reward_std": 0.07875682786107063, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 525.6272430419922, "epoch": 0.21656336345306548, "grad_norm": 0.09453535825014114, "kl": 0.0999755859375, "learning_rate": 1.918399617670011e-05, "loss": 0.0108, "reward": 2.1004465222358704, "reward_std": 0.06966617982834578, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 1.0, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 532.013427734375, "epoch": 0.21686207154058695, "grad_norm": 0.13287128508090973, "kl": 0.1015625, "learning_rate": 1.9179864409173947e-05, "loss": -0.003, "reward": 2.0820313692092896, "reward_std": 0.06454207096248865, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9994419813156128, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 539.0602722167969, "epoch": 0.21716077962810842, "grad_norm": 0.10478438436985016, "kl": 0.09814453125, "learning_rate": 1.9175722654855033e-05, "loss": 0.0057, "reward": 2.058035731315613, "reward_std": 0.07959691435098648, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 540.6183319091797, "epoch": 0.2174594877156299, "grad_norm": 0.11172876507043839, "kl": 0.109130859375, "learning_rate": 1.917157091824919e-05, "loss": -0.0011, "reward": 2.060267925262451, "reward_std": 0.07847191952168941, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 537.1161041259766, "epoch": 0.21775819580315137, "grad_norm": 0.13780635595321655, "kl": 0.100341796875, "learning_rate": 1.9167409203873095e-05, "loss": 0.0096, "reward": 2.1350446939468384, "reward_std": 0.13730753120034933, "rewards/accuracy_reward": 0.14955357555299997, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9966517984867096, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 529.0513610839844, "epoch": 0.21805690389067284, "grad_norm": 0.8654899001121521, "kl": 0.105712890625, "learning_rate": 1.916323751625429e-05, "loss": 0.0042, "reward": 2.0585938692092896, "reward_std": 0.09623305778950453, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 545.0982360839844, "epoch": 0.2183556119781943, "grad_norm": 0.08278798311948776, "kl": 0.09814453125, "learning_rate": 1.9159055859931163e-05, "loss": 0.0045, "reward": 2.051339328289032, "reward_std": 0.03772235009819269, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 532.1540374755859, "epoch": 0.21865432006571578, "grad_norm": 0.14625918865203857, "kl": 0.096923828125, "learning_rate": 1.915486423945294e-05, "loss": 0.0146, "reward": 2.080357313156128, "reward_std": 0.09846520144492388, "rewards/accuracy_reward": 0.08035714761354029, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 576.2433242797852, "epoch": 0.21895302815323725, "grad_norm": 0.1305595487356186, "kl": 0.0928955078125, "learning_rate": 1.9150662659379705e-05, "loss": 0.0166, "reward": 2.0535715520381927, "reward_std": 0.1012692041695118, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9977678954601288, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 569.4129791259766, "epoch": 0.21925173624075872, "grad_norm": 0.1589033305644989, "kl": 0.105712890625, "learning_rate": 1.914645112428235e-05, "loss": 0.0224, "reward": 2.0859375596046448, "reward_std": 0.1590290702879429, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9966517984867096, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 595.9910888671875, "epoch": 0.2195504443282802, "grad_norm": 0.14425742626190186, "kl": 0.109130859375, "learning_rate": 1.9142229638742623e-05, "loss": -0.0106, "reward": 2.110491156578064, "reward_std": 0.13120807334780693, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.996651828289032, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 556.8727874755859, "epoch": 0.21984915241580166, "grad_norm": 0.13361790776252747, "kl": 0.09521484375, "learning_rate": 1.913799820735309e-05, "loss": -0.0129, "reward": 2.088169753551483, "reward_std": 0.10323456209152937, "rewards/accuracy_reward": 0.09375000302679837, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9988839328289032, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 554.0647506713867, "epoch": 0.22014786050332313, "grad_norm": 0.14166712760925293, "kl": 0.0982666015625, "learning_rate": 1.9133756834717118e-05, "loss": -0.012, "reward": 2.0513393878936768, "reward_std": 0.12072656117379665, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9977678954601288, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 540.4687652587891, "epoch": 0.2204465685908446, "grad_norm": 0.12847471237182617, "kl": 0.0933837890625, "learning_rate": 1.9129505525448917e-05, "loss": -0.0, "reward": 2.026785910129547, "reward_std": 0.05728259216994047, "rewards/accuracy_reward": 0.026785715483129025, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 533.2745819091797, "epoch": 0.22074527667836608, "grad_norm": 0.17241400480270386, "kl": 0.09912109375, "learning_rate": 1.9125244284173497e-05, "loss": -0.0003, "reward": 2.026785761117935, "reward_std": 0.15385471656918526, "rewards/accuracy_reward": 0.04687500046566129, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.995535746216774, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 601.0245819091797, "epoch": 0.22104398476588755, "grad_norm": 0.09330423176288605, "kl": 0.0926513671875, "learning_rate": 1.912097311552666e-05, "loss": 0.0133, "reward": 2.107701003551483, "reward_std": 0.06402256805449724, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.998325914144516, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 579.4397735595703, "epoch": 0.22134269285340902, "grad_norm": 0.1322270780801773, "kl": 0.0830078125, "learning_rate": 1.9116692024155026e-05, "loss": 0.0247, "reward": 2.0435268878936768, "reward_std": 0.12898404709994793, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9988839626312256, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 557.1138610839844, "epoch": 0.2216414009409305, "grad_norm": 0.1338130086660385, "kl": 0.089599609375, "learning_rate": 1.9112401014716004e-05, "loss": 0.0001, "reward": 2.075334906578064, "reward_std": 0.09949612990021706, "rewards/accuracy_reward": 0.07812500605359674, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 581.3326110839844, "epoch": 0.22194010902845193, "grad_norm": 0.15779036283493042, "kl": 0.0892333984375, "learning_rate": 1.9108100091877787e-05, "loss": 0.0123, "reward": 2.0714286863803864, "reward_std": 0.13759701699018478, "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678805589676, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 571.9910888671875, "epoch": 0.2222388171159734, "grad_norm": 0.10729778558015823, "kl": 0.0870361328125, "learning_rate": 1.9103789260319362e-05, "loss": 0.012, "reward": 2.1356027722358704, "reward_std": 0.08490916527807713, "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 555.2388610839844, "epoch": 0.22253752520349487, "grad_norm": 0.1298067718744278, "kl": 0.09326171875, "learning_rate": 1.9099468524730485e-05, "loss": -0.001, "reward": 2.0429688692092896, "reward_std": 0.13204401172697544, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937649011612, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 569.8370819091797, "epoch": 0.22283623329101634, "grad_norm": 0.16320286691188812, "kl": 0.090576171875, "learning_rate": 1.90951378898117e-05, "loss": 0.0081, "reward": 2.0396206080913544, "reward_std": 0.14363870956003666, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9972098618745804, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 580.2477874755859, "epoch": 0.22313494137853782, "grad_norm": 7.630666732788086, "kl": 0.2325439453125, "learning_rate": 1.909079736027431e-05, "loss": -0.0015, "reward": 2.0803572237491608, "reward_std": 0.1783300470560789, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9888393431901932, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 565.9642944335938, "epoch": 0.2234336494660593, "grad_norm": 0.10092635452747345, "kl": 0.0887451171875, "learning_rate": 1.9086446940840386e-05, "loss": -0.0046, "reward": 2.098772406578064, "reward_std": 0.0765547938644886, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9983259439468384, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 563.1161041259766, "epoch": 0.22373235755358076, "grad_norm": 0.09811714291572571, "kl": 0.0963134765625, "learning_rate": 1.9082086636242757e-05, "loss": -0.0009, "reward": 2.046316981315613, "reward_std": 0.08158602751791477, "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 518.6339569091797, "epoch": 0.22403106564110223, "grad_norm": 0.18036359548568726, "kl": 0.099365234375, "learning_rate": 1.9077716451225007e-05, "loss": 0.0035, "reward": 2.0931921005249023, "reward_std": 0.201568940654397, "rewards/accuracy_reward": 0.10714285913854837, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.997209832072258, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 564.5915374755859, "epoch": 0.2243297737286237, "grad_norm": 0.15975047647953033, "kl": 0.09375, "learning_rate": 1.9073336390541472e-05, "loss": 0.0153, "reward": 2.0385046005249023, "reward_std": 0.13864813186228275, "rewards/accuracy_reward": 0.04687500139698386, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.9983258992433548, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 558.8281402587891, "epoch": 0.22462848181614517, "grad_norm": 0.11366813629865646, "kl": 0.091552734375, "learning_rate": 1.9068946458957225e-05, "loss": 0.0087, "reward": 2.0809152722358704, "reward_std": 0.1033148318529129, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.9983258992433548, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 566.6317138671875, "epoch": 0.22492718990366664, "grad_norm": 0.10313384979963303, "kl": 0.0921630859375, "learning_rate": 1.9064546661248084e-05, "loss": 0.0003, "reward": 2.116071581840515, "reward_std": 0.08783513586968184, "rewards/accuracy_reward": 0.11607143608853221, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 581.9754638671875, "epoch": 0.2252258979911881, "grad_norm": 0.12199605256319046, "kl": 0.093994140625, "learning_rate": 1.9060137002200597e-05, "loss": 0.0108, "reward": 2.0775671005249023, "reward_std": 0.09523651748895645, "rewards/accuracy_reward": 0.08928571688011289, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.997209832072258, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 595.4420013427734, "epoch": 0.22552460607870958, "grad_norm": 0.09244892001152039, "kl": 0.0919189453125, "learning_rate": 1.905571748661204e-05, "loss": 0.0084, "reward": 2.0424107909202576, "reward_std": 0.06006636843085289, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 589.9933319091797, "epoch": 0.22582331416623105, "grad_norm": 0.14272543787956238, "kl": 0.111083984375, "learning_rate": 1.9051288119290414e-05, "loss": -0.0042, "reward": 2.0396206378936768, "reward_std": 0.13348005339503288, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9949776828289032, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 590.8817291259766, "epoch": 0.22612202225375252, "grad_norm": 0.14576545357704163, "kl": 0.09521484375, "learning_rate": 1.9046848905054433e-05, "loss": 0.0124, "reward": 2.164062589406967, "reward_std": 0.15095004439353943, "rewards/accuracy_reward": 0.16964286682195961, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9988839328289032, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 599.0647583007812, "epoch": 0.226420730341274, "grad_norm": 0.13611078262329102, "kl": 0.1082763671875, "learning_rate": 1.904239984873353e-05, "loss": 0.005, "reward": 2.076451003551483, "reward_std": 0.08189497329294682, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.998325914144516, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 587.6294708251953, "epoch": 0.22671943842879547, "grad_norm": 0.19317534565925598, "kl": 0.10595703125, "learning_rate": 1.9037940955167845e-05, "loss": 0.0141, "reward": 2.0496652722358704, "reward_std": 0.17256246879696846, "rewards/accuracy_reward": 0.06696428754366934, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616454601288, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 555.9843902587891, "epoch": 0.22701814651631694, "grad_norm": 0.13334500789642334, "kl": 0.1064453125, "learning_rate": 1.9033472229208213e-05, "loss": -0.0021, "reward": 2.150669813156128, "reward_std": 0.11547632794827223, "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.996651828289032, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 581.482177734375, "epoch": 0.2273168546038384, "grad_norm": 0.17485931515693665, "kl": 0.0980224609375, "learning_rate": 1.902899367571617e-05, "loss": 0.0152, "reward": 1.9793527722358704, "reward_std": 0.15407881885766983, "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.9687500596046448, "rewards/tag_count_reward": 0.9949776977300644, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 557.3147735595703, "epoch": 0.22761556269135988, "grad_norm": 0.184566929936409, "kl": 0.105224609375, "learning_rate": 1.902450529956395e-05, "loss": 0.002, "reward": 2.0998885333538055, "reward_std": 0.2191246971487999, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.990513414144516, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 520.9062728881836, "epoch": 0.22791427077888135, "grad_norm": 4.09884786605835, "kl": 0.151123046875, "learning_rate": 1.902000710563445e-05, "loss": 0.0347, "reward": 2.106584906578064, "reward_std": 0.2022872194647789, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9838170111179352, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 507.54913330078125, "epoch": 0.22821297886640282, "grad_norm": 0.22078369557857513, "kl": 0.1119384765625, "learning_rate": 1.9015499098821283e-05, "loss": -0.0201, "reward": 2.010602742433548, "reward_std": 0.32721535488963127, "rewards/accuracy_reward": 0.1316964323632419, "rewards/format_reward": 0.9241071790456772, "rewards/tag_count_reward": 0.9547991454601288, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 555.5223388671875, "epoch": 0.22851168695392426, "grad_norm": 0.20509213209152222, "kl": 0.1068115234375, "learning_rate": 1.901098128402871e-05, "loss": 0.0016, "reward": 2.0652902722358704, "reward_std": 0.19950727745890617, "rewards/accuracy_reward": 0.10267857951112092, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9827009290456772, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 532.5491409301758, "epoch": 0.22881039504144574, "grad_norm": 0.24475646018981934, "kl": 0.1123046875, "learning_rate": 1.900645366617167e-05, "loss": 0.0235, "reward": 2.130022406578064, "reward_std": 0.22665279358625412, "rewards/accuracy_reward": 0.16741072572767735, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9871652126312256, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 515.2120742797852, "epoch": 0.2291091031289672, "grad_norm": 0.16581235826015472, "kl": 0.1136474609375, "learning_rate": 1.9001916250175764e-05, "loss": -0.0091, "reward": 2.10491082072258, "reward_std": 0.1595046343281865, "rewards/accuracy_reward": 0.12723214644938707, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9933036118745804, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 534.7411041259766, "epoch": 0.22940781121648868, "grad_norm": 0.1501980572938919, "kl": 0.1103515625, "learning_rate": 1.8997369040977266e-05, "loss": -0.0001, "reward": 2.060267984867096, "reward_std": 0.08885351894423366, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.995535746216774, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 516.1786117553711, "epoch": 0.22970651930401015, "grad_norm": 0.20876048505306244, "kl": 0.1226806640625, "learning_rate": 1.899281204352309e-05, "loss": -0.0175, "reward": 2.131696581840515, "reward_std": 0.1592700518667698, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933036267757416, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 503.85717010498047, "epoch": 0.23000522739153162, "grad_norm": 0.17989780008792877, "kl": 0.109375, "learning_rate": 1.8988245262770795e-05, "loss": 0.0035, "reward": 2.1473215222358704, "reward_std": 0.14813700318336487, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.995535746216774, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 531.1741256713867, "epoch": 0.2303039354790531, "grad_norm": 0.34493979811668396, "kl": 0.164306640625, "learning_rate": 1.8983668703688598e-05, "loss": 0.0009, "reward": 2.0262277126312256, "reward_std": 0.16473331581801176, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9815848767757416, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 529.7745819091797, "epoch": 0.23060264356657456, "grad_norm": 0.12870118021965027, "kl": 0.106201171875, "learning_rate": 1.8979082371255347e-05, "loss": 0.0117, "reward": 2.099888503551483, "reward_std": 0.09290281124413013, "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.997209832072258, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 531.6964569091797, "epoch": 0.23090135165409603, "grad_norm": 0.9222981333732605, "kl": 0.127685546875, "learning_rate": 1.8974486270460518e-05, "loss": 0.0013, "reward": 2.0100447237491608, "reward_std": 0.11468725465238094, "rewards/accuracy_reward": 0.02232142980210483, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9966518133878708, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 530.5826110839844, "epoch": 0.2312000597416175, "grad_norm": 0.08524914085865021, "kl": 0.0960693359375, "learning_rate": 1.8969880406304227e-05, "loss": 0.0069, "reward": 2.0859376192092896, "reward_std": 0.031835637986660004, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9988839328289032, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 528.7120819091797, "epoch": 0.23149876782913897, "grad_norm": 2.2026731967926025, "kl": 0.1834716796875, "learning_rate": 1.8965264783797192e-05, "loss": -0.0105, "reward": 2.185267925262451, "reward_std": 0.15655463747680187, "rewards/accuracy_reward": 0.19642858020961285, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 564.2187805175781, "epoch": 0.23179747591666044, "grad_norm": 0.12350954115390778, "kl": 0.105224609375, "learning_rate": 1.8960639407960764e-05, "loss": 0.0, "reward": 2.092634081840515, "reward_std": 0.11844383738934994, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.996651828289032, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 548.185302734375, "epoch": 0.23209618400418192, "grad_norm": 0.6682071089744568, "kl": 0.1348876953125, "learning_rate": 1.8956004283826897e-05, "loss": 0.0241, "reward": 2.1010046005249023, "reward_std": 0.13036386668682098, "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9960937649011612, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 558.8192291259766, "epoch": 0.2323948920917034, "grad_norm": 0.15683938562870026, "kl": 0.0948486328125, "learning_rate": 1.8951359416438152e-05, "loss": -0.0009, "reward": 2.1333706378936768, "reward_std": 0.13871852029114962, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9972098469734192, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 576.2143249511719, "epoch": 0.23269360017922486, "grad_norm": 0.11995020508766174, "kl": 0.0948486328125, "learning_rate": 1.894670481084769e-05, "loss": -0.0009, "reward": 2.0457589626312256, "reward_std": 0.10240260511636734, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 596.1339569091797, "epoch": 0.23299230826674633, "grad_norm": 26.309734344482422, "kl": 2.6480712890625, "learning_rate": 1.8942040472119263e-05, "loss": 0.1589, "reward": 2.050781339406967, "reward_std": 0.13389337062835693, "rewards/accuracy_reward": 0.06919643119908869, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9949776977300644, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 580.8884124755859, "epoch": 0.2332910163542678, "grad_norm": 0.11288188397884369, "kl": 0.097900390625, "learning_rate": 1.8937366405327217e-05, "loss": -0.0036, "reward": 2.06194207072258, "reward_std": 0.0744797820225358, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949777126312256, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 617.0446624755859, "epoch": 0.23358972444178927, "grad_norm": 0.15792964398860931, "kl": 0.087158203125, "learning_rate": 1.8932682615556478e-05, "loss": 0.0042, "reward": 2.1802456378936768, "reward_std": 0.2004222571849823, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9949777126312256, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 607.0402069091797, "epoch": 0.23388843252931074, "grad_norm": 0.1731245070695877, "kl": 0.1016845703125, "learning_rate": 1.8927989107902554e-05, "loss": -0.0053, "reward": 2.0892857909202576, "reward_std": 0.2158069983124733, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.988839328289032, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 618.7991333007812, "epoch": 0.2341871406168322, "grad_norm": 0.11757630854845047, "kl": 0.097900390625, "learning_rate": 1.8923285887471514e-05, "loss": -0.006, "reward": 2.0279018580913544, "reward_std": 0.1463738400489092, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9944196790456772, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 610.2522583007812, "epoch": 0.23448584870435368, "grad_norm": 0.15805913507938385, "kl": 0.0870361328125, "learning_rate": 1.8918572959380005e-05, "loss": 0.0194, "reward": 2.0652902722358704, "reward_std": 0.10818089731037617, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616305589676, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 635.0067291259766, "epoch": 0.23478455679187513, "grad_norm": 0.15997080504894257, "kl": 0.108642578125, "learning_rate": 1.891385032875523e-05, "loss": 0.0045, "reward": 2.025111675262451, "reward_std": 0.15728376992046833, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.991629496216774, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 588.8928985595703, "epoch": 0.2350832648793966, "grad_norm": 0.16555948555469513, "kl": 0.1014404296875, "learning_rate": 1.890911800073495e-05, "loss": 0.0165, "reward": 2.0228795409202576, "reward_std": 0.16259420849382877, "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.9821429252624512, "rewards/tag_count_reward": 0.9938616305589676, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 540.8393173217773, "epoch": 0.23538197296691807, "grad_norm": 0.13866837322711945, "kl": 0.0989990234375, "learning_rate": 1.8904375980467474e-05, "loss": -0.0001, "reward": 2.2025670409202576, "reward_std": 0.1274496465921402, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949777126312256, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 568.5089569091797, "epoch": 0.23568068105443954, "grad_norm": 0.13342957198619843, "kl": 0.11572265625, "learning_rate": 1.889962427311165e-05, "loss": -0.0064, "reward": 2.0485492050647736, "reward_std": 0.1548114288598299, "rewards/accuracy_reward": 0.07366071944124997, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9905134290456772, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 500.53572845458984, "epoch": 0.235979389141961, "grad_norm": 45.700714111328125, "kl": 4.6658935546875, "learning_rate": 1.8894862883836875e-05, "loss": 0.2388, "reward": 2.088727742433548, "reward_std": 0.20111438632011414, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9882812798023224, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 516.8772583007812, "epoch": 0.23627809722948248, "grad_norm": 0.1881440430879593, "kl": 0.12451171875, "learning_rate": 1.8890091817823073e-05, "loss": 0.0036, "reward": 2.1289063692092896, "reward_std": 0.15455695241689682, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9882812798023224, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 490.4062728881836, "epoch": 0.23657680531700395, "grad_norm": 0.19693659245967865, "kl": 0.118896484375, "learning_rate": 1.8885311080260695e-05, "loss": -0.0023, "reward": 2.053013503551483, "reward_std": 0.1853870376944542, "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9882812947034836, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 492.8683319091797, "epoch": 0.23687551340452542, "grad_norm": 3.2984516620635986, "kl": 0.632080078125, "learning_rate": 1.8880520676350717e-05, "loss": 0.0274, "reward": 1.99162957072258, "reward_std": 0.1835043728351593, "rewards/accuracy_reward": 0.026785714784637094, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9849330931901932, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 455.2968978881836, "epoch": 0.2371742214920469, "grad_norm": 0.25061559677124023, "kl": 0.1611328125, "learning_rate": 1.8875720611304628e-05, "loss": 0.0001, "reward": 2.0513393878936768, "reward_std": 0.2765902914106846, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.9799107611179352, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 425.17189025878906, "epoch": 0.23747292957956836, "grad_norm": 0.20455797016620636, "kl": 0.1494140625, "learning_rate": 1.887091089034443e-05, "loss": 0.0089, "reward": 2.0563617050647736, "reward_std": 0.18632843997329473, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9849330633878708, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 398.99778747558594, "epoch": 0.23777163766708984, "grad_norm": 0.3200564980506897, "kl": 0.177490234375, "learning_rate": 1.8866091518702622e-05, "loss": -0.0044, "reward": 1.9726563394069672, "reward_std": 0.21261707693338394, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9575893133878708, "rewards/tag_count_reward": 0.9771205931901932, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 390.91072845458984, "epoch": 0.2380703457546113, "grad_norm": 0.2669914662837982, "kl": 0.226318359375, "learning_rate": 1.8861262501622213e-05, "loss": 0.0068, "reward": 2.048549234867096, "reward_std": 0.19884126633405685, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.98604916036129, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 405.2254638671875, "epoch": 0.23836905384213278, "grad_norm": 0.44281673431396484, "kl": 0.217041015625, "learning_rate": 1.88564238443567e-05, "loss": 0.0143, "reward": 1.9369420409202576, "reward_std": 0.2567248046398163, "rewards/accuracy_reward": 0.013392857974395156, "rewards/format_reward": 0.9486607611179352, "rewards/tag_count_reward": 0.9748884290456772, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 404.73216247558594, "epoch": 0.23866776192965425, "grad_norm": 0.779727578163147, "kl": 0.22802734375, "learning_rate": 1.8851575552170064e-05, "loss": 0.0221, "reward": 1.9838170409202576, "reward_std": 0.3237249180674553, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.93526791036129, "rewards/tag_count_reward": 0.9592634439468384, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 369.0089340209961, "epoch": 0.23896647001717572, "grad_norm": 1.242087960243225, "kl": 0.30078125, "learning_rate": 1.884671763033678e-05, "loss": 0.0295, "reward": 2.05412957072258, "reward_std": 0.3196899965405464, "rewards/accuracy_reward": 0.14285714365541935, "rewards/format_reward": 0.9397321790456772, "rewards/tag_count_reward": 0.9715402275323868, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 338.9531478881836, "epoch": 0.2392651781046972, "grad_norm": 1.8074020147323608, "kl": 0.5849609375, "learning_rate": 1.8841850084141783e-05, "loss": 0.062, "reward": 1.9609376192092896, "reward_std": 0.3422044515609741, "rewards/accuracy_reward": 0.06250000139698386, "rewards/format_reward": 0.9308036118745804, "rewards/tag_count_reward": 0.9676339626312256, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 327.53795623779297, "epoch": 0.23956388619221866, "grad_norm": 3.5862886905670166, "kl": 0.99609375, "learning_rate": 1.883697291888049e-05, "loss": 0.0937, "reward": 1.938616156578064, "reward_std": 0.3633335158228874, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.9375000447034836, "rewards/tag_count_reward": 0.9609375298023224, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 328.56697845458984, "epoch": 0.23986259427974013, "grad_norm": 13.697088241577148, "kl": 2.337890625, "learning_rate": 1.8832086139858777e-05, "loss": 0.2518, "reward": 1.942522406578064, "reward_std": 0.40522007271647453, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9447545111179352, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 340.5245666503906, "epoch": 0.2401613023672616, "grad_norm": 24.36860466003418, "kl": 2.484375, "learning_rate": 1.8827189752392982e-05, "loss": 0.3212, "reward": 1.916852742433548, "reward_std": 0.33729859441518784, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.9375000447034836, "rewards/tag_count_reward": 0.9570312947034836, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 320.9598388671875, "epoch": 0.24046001045478307, "grad_norm": 22.355262756347656, "kl": 1.57861328125, "learning_rate": 1.882228376180989e-05, "loss": 0.1622, "reward": 2.0970983505249023, "reward_std": 0.21046067401766777, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9832589626312256, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 312.6651916503906, "epoch": 0.24075871854230455, "grad_norm": 15.361388206481934, "kl": 1.419921875, "learning_rate": 1.881736817344675e-05, "loss": 0.2527, "reward": 2.099330484867096, "reward_std": 0.25012731179594994, "rewards/accuracy_reward": 0.14062500931322575, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9832589626312256, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 315.20314025878906, "epoch": 0.24105742662982602, "grad_norm": 5.961979389190674, "kl": 0.59716796875, "learning_rate": 1.8812442992651224e-05, "loss": 0.0854, "reward": 2.0669643878936768, "reward_std": 0.1018037460744381, "rewards/accuracy_reward": 0.08035714761354029, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9955357313156128, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 303.9776916503906, "epoch": 0.24135613471734746, "grad_norm": 3.400210380554199, "kl": 0.3330078125, "learning_rate": 1.880750822478144e-05, "loss": 0.0703, "reward": 2.035156399011612, "reward_std": 0.10205755848437548, "rewards/accuracy_reward": 0.04910714388824999, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9949776977300644, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 301.2634048461914, "epoch": 0.24165484280486893, "grad_norm": 0.9171724915504456, "kl": 0.24560546875, "learning_rate": 1.880256387520593e-05, "loss": 0.0312, "reward": 1.9955357611179352, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678805589676, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 331.36385345458984, "epoch": 0.2419535508923904, "grad_norm": 1.1353065967559814, "kl": 0.1748046875, "learning_rate": 1.8797609949303674e-05, "loss": 0.0164, "reward": 2.0613840222358704, "reward_std": 0.04017857275903225, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966517984867096, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 325.8727798461914, "epoch": 0.24225225897991187, "grad_norm": 0.5805432200431824, "kl": 0.1397705078125, "learning_rate": 1.879264645246405e-05, "loss": -0.0114, "reward": 2.064732253551483, "reward_std": 0.0911037065088749, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 1.0, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 388.15403747558594, "epoch": 0.24255096706743334, "grad_norm": 0.32480692863464355, "kl": 0.1715087890625, "learning_rate": 1.8787673390086857e-05, "loss": 0.0373, "reward": 2.0306921005249023, "reward_std": 0.13017369341105223, "rewards/accuracy_reward": 0.04241071711294353, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 403.45983123779297, "epoch": 0.2428496751549548, "grad_norm": 13.912806510925293, "kl": 1.21826171875, "learning_rate": 1.8782690767582295e-05, "loss": 0.1099, "reward": 2.029576003551483, "reward_std": 0.05036567430943251, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937798023224, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 452.1808319091797, "epoch": 0.24314838324247628, "grad_norm": 0.7303954362869263, "kl": 0.1402587890625, "learning_rate": 1.8777698590370983e-05, "loss": 0.0325, "reward": 2.0585938692092896, "reward_std": 0.14102057088166475, "rewards/accuracy_reward": 0.07142857485450804, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9960937649011612, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 484.9174270629883, "epoch": 0.24344709132999776, "grad_norm": 18116.55859375, "kl": 9.0, "learning_rate": 1.8772696863883905e-05, "loss": 0.4996, "reward": 2.023995578289032, "reward_std": 0.2009468600153923, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9860491454601288, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 493.54019927978516, "epoch": 0.24374579941751923, "grad_norm": 0.14414218068122864, "kl": 0.1143798828125, "learning_rate": 1.876768559356246e-05, "loss": 0.0506, "reward": 2.0970983505249023, "reward_std": 0.13811988104134798, "rewards/accuracy_reward": 0.11830357671715319, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875298023224, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 542.8236694335938, "epoch": 0.2440445075050407, "grad_norm": 14.78456974029541, "kl": 1.4014892578125, "learning_rate": 1.8762664784858412e-05, "loss": 0.1559, "reward": 2.023437589406967, "reward_std": 0.20055489242076874, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9832589775323868, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 618.0469055175781, "epoch": 0.24434321559256217, "grad_norm": 0.19806954264640808, "kl": 0.1280517578125, "learning_rate": 1.875763444323391e-05, "loss": 0.0871, "reward": 1.9531250894069672, "reward_std": 0.2644011378288269, "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.9531250298023224, "rewards/tag_count_reward": 0.9776786118745804, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 591.5111846923828, "epoch": 0.24464192368008364, "grad_norm": 0.24580232799053192, "kl": 0.1185302734375, "learning_rate": 1.875259457416148e-05, "loss": 0.0906, "reward": 2.1043527722358704, "reward_std": 0.30570679157972336, "rewards/accuracy_reward": 0.17410715157166123, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9748884439468384, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 627.5826110839844, "epoch": 0.2449406317676051, "grad_norm": 0.24918343126773834, "kl": 0.1153564453125, "learning_rate": 1.8747545183123996e-05, "loss": 0.0484, "reward": 1.9955357909202576, "reward_std": 0.19733795896172523, "rewards/accuracy_reward": 0.031250001629814506, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.988839328289032, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 591.6295013427734, "epoch": 0.24523933985512658, "grad_norm": 0.1442556381225586, "kl": 0.1060791015625, "learning_rate": 1.8742486275614706e-05, "loss": 0.0388, "reward": 2.0736607909202576, "reward_std": 0.16536150872707367, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.995535746216774, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 570.9107437133789, "epoch": 0.24553804794264805, "grad_norm": 0.14015677571296692, "kl": 0.1168212890625, "learning_rate": 1.8737417857137204e-05, "loss": 0.0474, "reward": 2.077009081840515, "reward_std": 0.13766151946038008, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.994419664144516, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 505.30359649658203, "epoch": 0.24583675603016952, "grad_norm": 0.13642117381095886, "kl": 0.114013671875, "learning_rate": 1.873233993320543e-05, "loss": 0.0649, "reward": 2.0580357909202576, "reward_std": 0.13778439164161682, "rewards/accuracy_reward": 0.08035714412108064, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 524.4710235595703, "epoch": 0.246135464117691, "grad_norm": 6.595741271972656, "kl": 0.94140625, "learning_rate": 1.872725250934367e-05, "loss": 0.0614, "reward": 2.0625001192092896, "reward_std": 0.16805721260607243, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9933035969734192, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 517.0893020629883, "epoch": 0.24643417220521247, "grad_norm": 0.12714479863643646, "kl": 0.1114501953125, "learning_rate": 1.8722155591086545e-05, "loss": 0.0225, "reward": 2.032366156578064, "reward_std": 0.12021647021174431, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.994419664144516, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 509.1919937133789, "epoch": 0.24673288029273394, "grad_norm": 0.17795626819133759, "kl": 0.12060546875, "learning_rate": 1.8717049183979e-05, "loss": 0.0551, "reward": 2.181361675262451, "reward_std": 0.16654819808900356, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.991629496216774, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 513.1875228881836, "epoch": 0.2470315883802554, "grad_norm": 0.13466283679008484, "kl": 0.1190185546875, "learning_rate": 1.8711933293576303e-05, "loss": 0.0133, "reward": 2.092076003551483, "reward_std": 0.08183616399765015, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9983258992433548, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 486.3437728881836, "epoch": 0.24733029646777688, "grad_norm": 0.6974433064460754, "kl": 0.150390625, "learning_rate": 1.8706807925444045e-05, "loss": 0.0487, "reward": 2.022321492433548, "reward_std": 0.17349900677800179, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.986607164144516, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 518.0960083007812, "epoch": 0.24762900455529832, "grad_norm": 0.8022063970565796, "kl": 0.122802734375, "learning_rate": 1.870167308515812e-05, "loss": 0.0431, "reward": 2.057477742433548, "reward_std": 0.16275636106729507, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9882812947034836, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 493.47100830078125, "epoch": 0.2479277126428198, "grad_norm": 0.14090430736541748, "kl": 0.11279296875, "learning_rate": 1.869652877830474e-05, "loss": 0.0031, "reward": 2.099888503551483, "reward_std": 0.1016942635178566, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 497.4553756713867, "epoch": 0.24822642073034126, "grad_norm": 0.36721861362457275, "kl": 0.12939453125, "learning_rate": 1.8691375010480397e-05, "loss": 0.0394, "reward": 2.087611734867096, "reward_std": 0.21345224231481552, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9893973469734192, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 540.8259201049805, "epoch": 0.24852512881786273, "grad_norm": 0.15148983895778656, "kl": 0.119140625, "learning_rate": 1.868621178729189e-05, "loss": 0.0328, "reward": 2.0491072237491608, "reward_std": 0.1477718949317932, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9888393133878708, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 529.1585083007812, "epoch": 0.2488238369053842, "grad_norm": 7.021868705749512, "kl": 0.8948974609375, "learning_rate": 1.8681039114356298e-05, "loss": 0.0669, "reward": 2.0172992050647736, "reward_std": 0.15755436941981316, "rewards/accuracy_reward": 0.040178571827709675, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 529.2187728881836, "epoch": 0.24912254499290568, "grad_norm": 0.1968086212873459, "kl": 0.1121826171875, "learning_rate": 1.867585699730098e-05, "loss": -0.0081, "reward": 2.0636161863803864, "reward_std": 0.10330329462885857, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 548.4040374755859, "epoch": 0.24942125308042715, "grad_norm": 0.30495521426200867, "kl": 0.1898193359375, "learning_rate": 1.867066544176358e-05, "loss": 0.0033, "reward": 2.055245578289032, "reward_std": 0.11168898642063141, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.992745578289032, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 528.7165374755859, "epoch": 0.24971996116794862, "grad_norm": 0.2248619794845581, "kl": 0.1380615234375, "learning_rate": 1.8665464453391994e-05, "loss": 0.0101, "reward": 2.083147406578064, "reward_std": 0.15196639113128185, "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9938616454601288, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 547.6786041259766, "epoch": 0.2500186692554701, "grad_norm": 0.18327116966247559, "kl": 0.098388671875, "learning_rate": 1.866025403784439e-05, "loss": 0.0179, "reward": 2.117745578289032, "reward_std": 0.16462569124996662, "rewards/accuracy_reward": 0.13839286798611283, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9949776977300644, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 581.2857360839844, "epoch": 0.2503173773429916, "grad_norm": 0.1438259482383728, "kl": 0.090576171875, "learning_rate": 1.8655034200789187e-05, "loss": 0.0194, "reward": 2.1010046005249023, "reward_std": 0.11566522531211376, "rewards/accuracy_reward": 0.11160714412108064, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937649011612, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 576.0044860839844, "epoch": 0.25061608543051306, "grad_norm": 0.133147194981575, "kl": 0.0845947265625, "learning_rate": 1.8649804947905057e-05, "loss": 0.0277, "reward": 2.0474331378936768, "reward_std": 0.12894676625728607, "rewards/accuracy_reward": 0.06919643306173384, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9916294813156128, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 614.1027069091797, "epoch": 0.2509147935180345, "grad_norm": 0.14351031184196472, "kl": 0.085693359375, "learning_rate": 1.864456628488092e-05, "loss": 0.0671, "reward": 2.1322545409202576, "reward_std": 0.2188372127711773, "rewards/accuracy_reward": 0.16741071920841932, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.984933078289032, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 647.3817138671875, "epoch": 0.25121350160555594, "grad_norm": 0.15273825824260712, "kl": 0.0875244140625, "learning_rate": 1.8639318217415918e-05, "loss": 0.0441, "reward": 2.019531339406967, "reward_std": 0.3228912204504013, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9748884290456772, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 631.2411041259766, "epoch": 0.2515122096930774, "grad_norm": 0.1184600368142128, "kl": 0.083740234375, "learning_rate": 1.8634060751219442e-05, "loss": 0.0625, "reward": 2.01116082072258, "reward_std": 0.21613645181059837, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.9776786118745804, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 648.9129638671875, "epoch": 0.2518109177805989, "grad_norm": 0.13520176708698273, "kl": 0.097412109375, "learning_rate": 1.8628793892011103e-05, "loss": 0.0715, "reward": 2.029576003551483, "reward_std": 0.33559121936559677, "rewards/accuracy_reward": 0.1138392947614193, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9626116454601288, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 662.9062805175781, "epoch": 0.25210962586812036, "grad_norm": 0.15851840376853943, "kl": 0.0802001953125, "learning_rate": 1.8623517645520714e-05, "loss": 0.0729, "reward": 2.0574777722358704, "reward_std": 0.2956627160310745, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9793527275323868, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 622.2187805175781, "epoch": 0.25240833395564183, "grad_norm": 0.19662076234817505, "kl": 1.1475830078125, "learning_rate": 1.861823201748833e-05, "loss": 0.0643, "reward": 2.141183078289032, "reward_std": 0.3228520080447197, "rewards/accuracy_reward": 0.2120535857975483, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.969308078289032, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 630.4620666503906, "epoch": 0.2527070420431633, "grad_norm": 0.13568644225597382, "kl": 0.0799560546875, "learning_rate": 1.861293701366418e-05, "loss": 0.0558, "reward": 2.031808078289032, "reward_std": 0.22578419372439384, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.984933078289032, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 642.0870819091797, "epoch": 0.25300575013068477, "grad_norm": 0.11370380222797394, "kl": 0.0831298828125, "learning_rate": 1.8607632639808724e-05, "loss": 0.0341, "reward": 2.0513393878936768, "reward_std": 0.15329240635037422, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9843750149011612, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 675.4911041259766, "epoch": 0.25330445821820624, "grad_norm": 0.1550089418888092, "kl": 0.082275390625, "learning_rate": 1.8602318901692592e-05, "loss": 0.0518, "reward": 2.0418527722358704, "reward_std": 0.20613490231335163, "rewards/accuracy_reward": 0.0937500074505806, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9793527275323868, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 652.8036041259766, "epoch": 0.2536031663057277, "grad_norm": 0.14443738758563995, "kl": 0.14501953125, "learning_rate": 1.8596995805096615e-05, "loss": 0.0452, "reward": 1.9882812798023224, "reward_std": 0.22681822627782822, "rewards/accuracy_reward": 0.042410716880112886, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9793527275323868, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 642.1094055175781, "epoch": 0.2539018743932492, "grad_norm": 0.11510393023490906, "kl": 0.080078125, "learning_rate": 1.8591663355811794e-05, "loss": 0.0166, "reward": 2.1400671005249023, "reward_std": 0.16002348065376282, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 626.8616333007812, "epoch": 0.25420058248077065, "grad_norm": 0.1600610315799713, "kl": 0.08447265625, "learning_rate": 1.8586321559639316e-05, "loss": 0.0202, "reward": 2.1177456378936768, "reward_std": 0.1918745506554842, "rewards/accuracy_reward": 0.15625000838190317, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9860491454601288, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 599.8169860839844, "epoch": 0.2544992905682921, "grad_norm": 0.14111031591892242, "kl": 0.080078125, "learning_rate": 1.8580970422390535e-05, "loss": 0.0488, "reward": 2.0747768580913544, "reward_std": 0.18705390021204948, "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9854910969734192, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 602.3147583007812, "epoch": 0.2547979986558136, "grad_norm": 0.14941930770874023, "kl": 0.44384765625, "learning_rate": 1.8575609949886955e-05, "loss": 0.0142, "reward": 2.1361607909202576, "reward_std": 0.2474056538194418, "rewards/accuracy_reward": 0.17857143771834671, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9866071790456772, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 594.8817291259766, "epoch": 0.25509670674333507, "grad_norm": 0.5976813435554504, "kl": 0.0963134765625, "learning_rate": 1.8570240147960254e-05, "loss": 0.0385, "reward": 2.0385045409202576, "reward_std": 0.16685935109853745, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9871652126312256, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 572.6852874755859, "epoch": 0.25539541483085654, "grad_norm": 0.49262896180152893, "kl": 0.1048583984375, "learning_rate": 1.8564861022452244e-05, "loss": 0.0285, "reward": 2.020089328289032, "reward_std": 0.11791195720434189, "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933036118745804, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 594.9196624755859, "epoch": 0.255694122918378, "grad_norm": 0.15445531904697418, "kl": 0.092041015625, "learning_rate": 1.855947257921489e-05, "loss": 0.0224, "reward": 1.9938616752624512, "reward_std": 0.09437747672200203, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9938616156578064, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 523.3169784545898, "epoch": 0.2559928310058995, "grad_norm": 0.16664451360702515, "kl": 0.080322265625, "learning_rate": 1.8554074824110285e-05, "loss": 0.0061, "reward": 2.0965402722358704, "reward_std": 0.11724387854337692, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 568.9553985595703, "epoch": 0.25629153909342095, "grad_norm": 0.1177176907658577, "kl": 0.08447265625, "learning_rate": 1.8548667763010664e-05, "loss": 0.0312, "reward": 2.095424234867096, "reward_std": 0.12242686562240124, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.997209832072258, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 591.5491333007812, "epoch": 0.2565902471809424, "grad_norm": 0.12191642820835114, "kl": 0.08740234375, "learning_rate": 1.8543251401798374e-05, "loss": 0.0213, "reward": 2.0636162161827087, "reward_std": 0.11797291226685047, "rewards/accuracy_reward": 0.07589286426082253, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 557.7210235595703, "epoch": 0.2568889552684639, "grad_norm": 0.12499865144491196, "kl": 0.083740234375, "learning_rate": 1.853782574636589e-05, "loss": 0.0318, "reward": 2.107142925262451, "reward_std": 0.11860774271190166, "rewards/accuracy_reward": 0.1205357238650322, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.995535746216774, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 583.3660888671875, "epoch": 0.25718766335598536, "grad_norm": 0.1404704600572586, "kl": 0.08251953125, "learning_rate": 1.8532390802615788e-05, "loss": 0.0175, "reward": 2.1010045409202576, "reward_std": 0.13824624195694923, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9916294813156128, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 570.2254791259766, "epoch": 0.25748637144350683, "grad_norm": 0.12337784469127655, "kl": 0.0850830078125, "learning_rate": 1.8526946576460757e-05, "loss": 0.0311, "reward": 2.1316965222358704, "reward_std": 0.11599720641970634, "rewards/accuracy_reward": 0.14732143841683865, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 549.6205749511719, "epoch": 0.2577850795310283, "grad_norm": 0.13547633588314056, "kl": 0.093994140625, "learning_rate": 1.8521493073823583e-05, "loss": 0.0289, "reward": 2.119977831840515, "reward_std": 0.1479758070781827, "rewards/accuracy_reward": 0.1316964386496693, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949777126312256, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 542.1875228881836, "epoch": 0.2580837876185498, "grad_norm": 0.13042673468589783, "kl": 0.0870361328125, "learning_rate": 1.8516030300637142e-05, "loss": 0.0278, "reward": 2.053013503551483, "reward_std": 0.1157610435038805, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949777126312256, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 584.5312652587891, "epoch": 0.25838249570607125, "grad_norm": 0.23779642581939697, "kl": 0.083740234375, "learning_rate": 1.851055826284439e-05, "loss": 0.0231, "reward": 2.0770090222358704, "reward_std": 0.11244368925690651, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 583.1116180419922, "epoch": 0.2586812037935927, "grad_norm": 0.16252903640270233, "kl": 0.098388671875, "learning_rate": 1.850507696639838e-05, "loss": 0.0331, "reward": 2.023437589406967, "reward_std": 0.1534289475530386, "rewards/accuracy_reward": 0.046875001629814506, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9921875298023224, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 533.8794784545898, "epoch": 0.2589799118811142, "grad_norm": 0.17376114428043365, "kl": 0.1064453125, "learning_rate": 1.849958641726221e-05, "loss": 0.0206, "reward": 2.1367188692092896, "reward_std": 0.13150371704250574, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9938616305589676, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 550.5424346923828, "epoch": 0.25927861996863566, "grad_norm": 0.2222760021686554, "kl": 0.091064453125, "learning_rate": 1.849408662140907e-05, "loss": 0.0229, "reward": 2.0061385333538055, "reward_std": 0.11644704733043909, "rewards/accuracy_reward": 0.024553573224693537, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455633878708, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 554.341552734375, "epoch": 0.25957732805615713, "grad_norm": 0.18735897541046143, "kl": 0.1114501953125, "learning_rate": 1.8488577584822197e-05, "loss": 0.049, "reward": 2.1439732909202576, "reward_std": 0.21221916750073433, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553805589676, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 550.2366485595703, "epoch": 0.2598760361436786, "grad_norm": 0.1842295080423355, "kl": 0.1221923828125, "learning_rate": 1.8483059313494877e-05, "loss": 0.0666, "reward": 2.0485492050647736, "reward_std": 0.22358180582523346, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9815848618745804, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 537.0156478881836, "epoch": 0.2601747442312001, "grad_norm": 0.49171096086502075, "kl": 0.194580078125, "learning_rate": 1.847753181343046e-05, "loss": 0.1013, "reward": 2.036830425262451, "reward_std": 0.2659831829369068, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9720982611179352, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 536.997802734375, "epoch": 0.26047345231872154, "grad_norm": 0.34244561195373535, "kl": 0.194091796875, "learning_rate": 1.8471995090642312e-05, "loss": 0.1707, "reward": 1.970424234867096, "reward_std": 0.39580073207616806, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.9375000298023224, "rewards/tag_count_reward": 0.9570312947034836, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 551.5089569091797, "epoch": 0.260772160406243, "grad_norm": 0.41386473178863525, "kl": 0.172607421875, "learning_rate": 1.8466449151153853e-05, "loss": 0.191, "reward": 1.9570313394069672, "reward_std": 0.40818119794130325, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.9218750447034836, "rewards/tag_count_reward": 0.9414062947034836, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 578.1183319091797, "epoch": 0.2610708684937645, "grad_norm": 0.35368436574935913, "kl": 0.22265625, "learning_rate": 1.8460894000998518e-05, "loss": 0.2096, "reward": 1.872209906578064, "reward_std": 0.5072178244590759, "rewards/accuracy_reward": 0.05580357415601611, "rewards/format_reward": 0.8928571790456772, "rewards/tag_count_reward": 0.9235491454601288, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 522.1406326293945, "epoch": 0.26136957658128596, "grad_norm": 0.3423708379268646, "kl": 0.199951171875, "learning_rate": 1.8455329646219767e-05, "loss": 0.1829, "reward": 2.0145090520381927, "reward_std": 0.4426068291068077, "rewards/accuracy_reward": 0.15625000838190317, "rewards/format_reward": 0.9174107611179352, "rewards/tag_count_reward": 0.940848246216774, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 509.31922149658203, "epoch": 0.2616682846688074, "grad_norm": 0.361074298620224, "kl": 0.186279296875, "learning_rate": 1.844975609287107e-05, "loss": 0.1556, "reward": 1.9609375596046448, "reward_std": 0.314551766961813, "rewards/accuracy_reward": 0.053571430733427405, "rewards/format_reward": 0.9464286118745804, "rewards/tag_count_reward": 0.9609375447034836, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 526.7522506713867, "epoch": 0.2619669927563289, "grad_norm": 1.4976379871368408, "kl": 0.47021484375, "learning_rate": 1.8444173347015912e-05, "loss": 0.1961, "reward": 1.9330357909202576, "reward_std": 0.3637930378317833, "rewards/accuracy_reward": 0.08705357532016933, "rewards/format_reward": 0.9084821790456772, "rewards/tag_count_reward": 0.9375000447034836, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 445.0022506713867, "epoch": 0.26226570084385037, "grad_norm": 0.2853803038597107, "kl": 0.15283203125, "learning_rate": 1.843858141472777e-05, "loss": 0.0878, "reward": 2.056361734867096, "reward_std": 0.22389471717178822, "rewards/accuracy_reward": 0.10267857508733869, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 455.9397506713867, "epoch": 0.26256440893137184, "grad_norm": 0.4720841348171234, "kl": 0.1156005859375, "learning_rate": 1.8432980302090116e-05, "loss": 0.0896, "reward": 2.104910731315613, "reward_std": 0.15967703238129616, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9843750298023224, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 450.8616256713867, "epoch": 0.2628631170188933, "grad_norm": 3.9991588592529297, "kl": 0.141845703125, "learning_rate": 1.842737001519642e-05, "loss": 0.0608, "reward": 2.0541295409202576, "reward_std": 0.13528813049197197, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9893973618745804, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 431.61609649658203, "epoch": 0.2631618251064148, "grad_norm": 29221.216796875, "kl": 1326.780029296875, "learning_rate": 1.8421750560150112e-05, "loss": 71.0881, "reward": 2.056919753551483, "reward_std": 0.14310095086693764, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9899553805589676, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 451.74778747558594, "epoch": 0.26346053319393625, "grad_norm": 1.945155382156372, "kl": 0.1107177734375, "learning_rate": 1.841612194306462e-05, "loss": 0.0093, "reward": 2.0691965222358704, "reward_std": 0.038984465412795544, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678805589676, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 402.64064025878906, "epoch": 0.26375924128145767, "grad_norm": 687.9214477539062, "kl": 57.576904296875, "learning_rate": 1.8410484170063317e-05, "loss": 2.4381, "reward": 2.048549234867096, "reward_std": 0.028257629135623574, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.9994419813156128, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 417.8750228881836, "epoch": 0.26405794936897914, "grad_norm": 0.5842060446739197, "kl": 0.1861572265625, "learning_rate": 1.8404837247279558e-05, "loss": 0.01, "reward": 2.013950914144516, "reward_std": 0.09453790634870529, "rewards/accuracy_reward": 0.02678571455180645, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.9938616156578064, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 400.2835006713867, "epoch": 0.2643566574565006, "grad_norm": 6.161169052124023, "kl": 0.1400146484375, "learning_rate": 1.8399181180856635e-05, "loss": 0.066, "reward": 2.116071581840515, "reward_std": 0.14768151007592678, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 412.22322845458984, "epoch": 0.2646553655440221, "grad_norm": 49.132625579833984, "kl": 4.4404296875, "learning_rate": 1.8393515976947795e-05, "loss": 0.382, "reward": 2.0982143878936768, "reward_std": 0.08362732548266649, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 397.0870666503906, "epoch": 0.26495407363154355, "grad_norm": 0.3885454833507538, "kl": 0.173828125, "learning_rate": 1.8387841641716226e-05, "loss": 0.0167, "reward": 2.0725447237491608, "reward_std": 0.03125000186264515, "rewards/accuracy_reward": 0.0758928582072258, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9988839328289032, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 406.6986770629883, "epoch": 0.265252781719065, "grad_norm": 1.0713754892349243, "kl": 0.28173828125, "learning_rate": 1.8382158181335046e-05, "loss": 0.0195, "reward": 2.0312501192092896, "reward_std": 0.08201200887560844, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9933035969734192, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 373.98216247558594, "epoch": 0.2655514898065865, "grad_norm": 33.684539794921875, "kl": 3.2020263671875, "learning_rate": 1.8376465601987302e-05, "loss": 0.2569, "reward": 2.0150670409202576, "reward_std": 0.1131645068526268, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.990513414144516, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 373.4375228881836, "epoch": 0.26585019789410796, "grad_norm": 8.81002426147461, "kl": 1.4794921875, "learning_rate": 1.837076390986597e-05, "loss": 0.1314, "reward": 2.0881697833538055, "reward_std": 0.1835097186267376, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9832589626312256, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 361.2544860839844, "epoch": 0.26614890598162944, "grad_norm": 0.7248677611351013, "kl": 0.2203369140625, "learning_rate": 1.8365053111173924e-05, "loss": 0.009, "reward": 2.1406250596046448, "reward_std": 0.1691837329417467, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9843750447034836, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 340.18751525878906, "epoch": 0.2664476140691509, "grad_norm": 80.57315063476562, "kl": 7.3287353515625, "learning_rate": 1.8359333212123958e-05, "loss": 0.5765, "reward": 2.068080484867096, "reward_std": 0.21793736517429352, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9765625447034836, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 379.5111770629883, "epoch": 0.2667463221566724, "grad_norm": 24.625009536743164, "kl": 2.87841796875, "learning_rate": 1.835360421893876e-05, "loss": 0.2317, "reward": 2.092633992433548, "reward_std": 0.148915889672935, "rewards/accuracy_reward": 0.12276786123402417, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9899553954601288, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 376.0134048461914, "epoch": 0.26704503024419385, "grad_norm": 4.259302139282227, "kl": 0.66357421875, "learning_rate": 1.834786613785091e-05, "loss": 0.1168, "reward": 1.982142984867096, "reward_std": 0.2172011248767376, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.9642857760190964, "rewards/tag_count_reward": 0.9687500298023224, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 359.9821548461914, "epoch": 0.2673437383317153, "grad_norm": 4.558290958404541, "kl": 1.8671875, "learning_rate": 1.8342118975102887e-05, "loss": 0.1622, "reward": 1.9559152722358704, "reward_std": 0.2244024220854044, "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.9665178805589676, "rewards/tag_count_reward": 0.9760045111179352, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 365.1674346923828, "epoch": 0.2676424464192368, "grad_norm": 2.513187885284424, "kl": 0.9482421875, "learning_rate": 1.833636273694703e-05, "loss": 0.1678, "reward": 2.0412946939468384, "reward_std": 0.17339041456580162, "rewards/accuracy_reward": 0.08258928707800806, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9787946790456772, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 394.20091247558594, "epoch": 0.26794115450675826, "grad_norm": 2.6183135509490967, "kl": 0.51513671875, "learning_rate": 1.8330597429645566e-05, "loss": 0.1258, "reward": 2.0357143878936768, "reward_std": 0.18352274782955647, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9754464626312256, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 389.7276916503906, "epoch": 0.26823986259427973, "grad_norm": 0.2950279712677002, "kl": 0.298828125, "learning_rate": 1.8324823059470587e-05, "loss": 0.032, "reward": 2.0703125596046448, "reward_std": 0.07691855356097221, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9899553805589676, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 374.6808166503906, "epoch": 0.2685385706818012, "grad_norm": 1.8881800174713135, "kl": 0.5322265625, "learning_rate": 1.8319039632704042e-05, "loss": 0.0873, "reward": 2.1015626192092896, "reward_std": 0.1813326571136713, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.987723246216774, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 396.9464416503906, "epoch": 0.2688372787693227, "grad_norm": 0.5968738794326782, "kl": 0.2003173828125, "learning_rate": 1.8313247155637725e-05, "loss": 0.0535, "reward": 2.026227831840515, "reward_std": 0.17181934416294098, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.990513414144516, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 398.3482360839844, "epoch": 0.26913598685684414, "grad_norm": 1.2569233179092407, "kl": 0.237060546875, "learning_rate": 1.830744563457329e-05, "loss": 0.1048, "reward": 2.021763503551483, "reward_std": 0.1558468323200941, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9838170111179352, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 375.67189025878906, "epoch": 0.2694346949443656, "grad_norm": 0.3392607569694519, "kl": 0.224853515625, "learning_rate": 1.8301635075822222e-05, "loss": 0.1525, "reward": 2.0301340222358704, "reward_std": 0.2132299244403839, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9832589626312256, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 422.87278747558594, "epoch": 0.2697334030318871, "grad_norm": 0.3840981423854828, "kl": 0.15576171875, "learning_rate": 1.8295815485705842e-05, "loss": 0.0435, "reward": 2.0189733505249023, "reward_std": 0.10438985750079155, "rewards/accuracy_reward": 0.03125000186264515, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 421.5692138671875, "epoch": 0.27003211111940856, "grad_norm": 0.30747029185295105, "kl": 0.171875, "learning_rate": 1.8289986870555287e-05, "loss": 0.0988, "reward": 2.0770090222358704, "reward_std": 0.14734355546534061, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9832589626312256, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 426.0401916503906, "epoch": 0.27033081920693003, "grad_norm": 0.6791226267814636, "kl": 0.1820068359375, "learning_rate": 1.8284149236711527e-05, "loss": 0.0861, "reward": 1.9966518878936768, "reward_std": 0.17319505102932453, "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9854910969734192, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 451.3593978881836, "epoch": 0.2706295272944515, "grad_norm": 0.585615873336792, "kl": 0.181640625, "learning_rate": 1.8278302590525326e-05, "loss": 0.1183, "reward": 2.0100446939468384, "reward_std": 0.2291083261370659, "rewards/accuracy_reward": 0.06250000488944352, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9765625447034836, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 463.5178756713867, "epoch": 0.27092823538197297, "grad_norm": 0.6108899116516113, "kl": 0.1160888671875, "learning_rate": 1.8272446938357272e-05, "loss": 0.0784, "reward": 2.058593899011612, "reward_std": 0.1893401276320219, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.984933078289032, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 452.00894927978516, "epoch": 0.27122694346949444, "grad_norm": 0.8488578796386719, "kl": 0.1650390625, "learning_rate": 1.826658228657773e-05, "loss": 0.1276, "reward": 2.0178572237491608, "reward_std": 0.2368055135011673, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.9732143431901932, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 507.5022659301758, "epoch": 0.2715256515570159, "grad_norm": 0.41942551732063293, "kl": 0.229736328125, "learning_rate": 1.826070864156688e-05, "loss": 0.1853, "reward": 1.9849331080913544, "reward_std": 0.34486590325832367, "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.9419643133878708, "rewards/tag_count_reward": 0.9559152275323868, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 495.7812805175781, "epoch": 0.2718243596445374, "grad_norm": 0.5185264945030212, "kl": 0.241455078125, "learning_rate": 1.8254826009714663e-05, "loss": 0.1462, "reward": 2.0379465222358704, "reward_std": 0.31891387701034546, "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.9709821939468384, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 490.79244232177734, "epoch": 0.27212306773205885, "grad_norm": 2.3198657035827637, "kl": 0.434814453125, "learning_rate": 1.8248934397420802e-05, "loss": 0.3326, "reward": 1.860491156578064, "reward_std": 0.5046837106347084, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.8928571790456772, "rewards/tag_count_reward": 0.9229911118745804, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 479.42413330078125, "epoch": 0.2724217758195803, "grad_norm": 0.27103009819984436, "kl": 0.18212890625, "learning_rate": 1.82430338110948e-05, "loss": 0.1494, "reward": 1.989397406578064, "reward_std": 0.2792120426893234, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.948660746216774, "rewards/tag_count_reward": 0.9648437947034836, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 464.3326110839844, "epoch": 0.2727204839071018, "grad_norm": 0.3713147044181824, "kl": 0.111083984375, "learning_rate": 1.8237124257155917e-05, "loss": 0.1355, "reward": 2.0184153020381927, "reward_std": 0.2755810543894768, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9715402275323868, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 443.5089416503906, "epoch": 0.27301919199462327, "grad_norm": 0.21166561543941498, "kl": 0.129638671875, "learning_rate": 1.823120574203317e-05, "loss": 0.1072, "reward": 2.0318081080913544, "reward_std": 0.14935417659580708, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.984933078289032, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 464.42413330078125, "epoch": 0.27331790008214474, "grad_norm": 0.41084709763526917, "kl": 0.1044921875, "learning_rate": 1.822527827216532e-05, "loss": 0.0974, "reward": 2.0357143580913544, "reward_std": 0.20459807105362415, "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9799107611179352, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 422.66519927978516, "epoch": 0.2736166081696662, "grad_norm": 0.4103494882583618, "kl": 0.1123046875, "learning_rate": 1.8219341854000873e-05, "loss": 0.1106, "reward": 2.055803656578064, "reward_std": 0.2291455827653408, "rewards/accuracy_reward": 0.09151786379516125, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9843750447034836, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 437.66072845458984, "epoch": 0.2739153162571877, "grad_norm": 0.8921476602554321, "kl": 0.24658203125, "learning_rate": 1.821339649399807e-05, "loss": 0.2275, "reward": 1.9893974363803864, "reward_std": 0.33918145298957825, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.9352678954601288, "rewards/tag_count_reward": 0.9536830633878708, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 458.1562728881836, "epoch": 0.27421402434470915, "grad_norm": 1.3469569683074951, "kl": 0.21875, "learning_rate": 1.8207442198624882e-05, "loss": 0.3302, "reward": 1.885602742433548, "reward_std": 0.4766450375318527, "rewards/accuracy_reward": 0.04241071455180645, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9324777126312256, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 428.29466247558594, "epoch": 0.2745127324322306, "grad_norm": 1.327561855316162, "kl": 0.50244140625, "learning_rate": 1.8201478974358996e-05, "loss": 0.3811, "reward": 1.9229911863803864, "reward_std": 0.48089350014925003, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.8883928805589676, "rewards/tag_count_reward": 0.918526828289032, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 397.3817138671875, "epoch": 0.2748114405197521, "grad_norm": 0.2557903230190277, "kl": 0.12060546875, "learning_rate": 1.8195506827687818e-05, "loss": 0.1089, "reward": 2.083705484867096, "reward_std": 0.16749700531363487, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9832589477300644, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 355.26116943359375, "epoch": 0.27511014860727356, "grad_norm": 0.22162000834941864, "kl": 0.115966796875, "learning_rate": 1.8189525765108457e-05, "loss": 0.0954, "reward": 2.050781399011612, "reward_std": 0.1004464291036129, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9905134290456772, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 375.9218978881836, "epoch": 0.27540885669479503, "grad_norm": 0.3269272446632385, "kl": 0.1912841796875, "learning_rate": 1.8183535793127722e-05, "loss": 0.1137, "reward": 2.074776828289032, "reward_std": 0.18578222393989563, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9877232611179352, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 315.87501525878906, "epoch": 0.2757075647823165, "grad_norm": 0.17624634504318237, "kl": 0.1251220703125, "learning_rate": 1.817753691826212e-05, "loss": 0.0326, "reward": 2.0753349363803864, "reward_std": 0.05456917732954025, "rewards/accuracy_reward": 0.08482143026776612, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.997209832072258, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 340.00894927978516, "epoch": 0.276006272869838, "grad_norm": 0.2511219084262848, "kl": 0.123291015625, "learning_rate": 1.8171529147037835e-05, "loss": 0.0316, "reward": 2.0619420409202576, "reward_std": 0.10178205370903015, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 326.3326110839844, "epoch": 0.27630498095735945, "grad_norm": 0.20849256217479706, "kl": 0.1197509765625, "learning_rate": 1.8165512485990734e-05, "loss": 0.0145, "reward": 2.0569196939468384, "reward_std": 0.1288723610341549, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966517984867096, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 384.0044860839844, "epoch": 0.27660368904488086, "grad_norm": 0.3363122344017029, "kl": 0.1204833984375, "learning_rate": 1.8159486941666354e-05, "loss": 0.037, "reward": 2.005580484867096, "reward_std": 0.08511811681091785, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 377.3660888671875, "epoch": 0.27690239713240233, "grad_norm": 0.24254298210144043, "kl": 0.138427734375, "learning_rate": 1.8153452520619897e-05, "loss": 0.0286, "reward": 2.0580358505249023, "reward_std": 0.12533817999064922, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9955357313156128, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 385.9732360839844, "epoch": 0.2772011052199238, "grad_norm": 0.1859503835439682, "kl": 0.1121826171875, "learning_rate": 1.814740922941622e-05, "loss": 0.0213, "reward": 2.1289063692092896, "reward_std": 0.1315176822245121, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.997209832072258, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 422.1272506713867, "epoch": 0.2774998133074453, "grad_norm": 0.3720918297767639, "kl": 0.1217041015625, "learning_rate": 1.8141357074629838e-05, "loss": 0.0805, "reward": 2.0256697237491608, "reward_std": 0.14004772901535034, "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9921875596046448, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 438.73216247558594, "epoch": 0.27779852139496675, "grad_norm": 0.46434006094932556, "kl": 0.126220703125, "learning_rate": 1.8135296062844893e-05, "loss": 0.2471, "reward": 2.036272406578064, "reward_std": 0.2924267016351223, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.9352678954601288, "rewards/tag_count_reward": 0.9760045260190964, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 473.5268096923828, "epoch": 0.2780972294824882, "grad_norm": 0.631721556186676, "kl": 0.200927734375, "learning_rate": 1.8129226200655177e-05, "loss": 0.5787, "reward": 1.7684153020381927, "reward_std": 0.5831883028149605, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.7410714626312256, "rewards/tag_count_reward": 0.9224330633878708, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 490.88842010498047, "epoch": 0.2783959375700097, "grad_norm": 1.0529640913009644, "kl": 0.45166015625, "learning_rate": 1.8123147494664105e-05, "loss": 0.8396, "reward": 1.5133929252624512, "reward_std": 0.6491270214319229, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.6049107313156128, "rewards/tag_count_reward": 0.8705357611179352, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 434.99332427978516, "epoch": 0.27869464565753116, "grad_norm": 0.4428441822528839, "kl": 0.3056640625, "learning_rate": 1.8117059951484714e-05, "loss": 0.7799, "reward": 1.6439732909202576, "reward_std": 0.5689154118299484, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.6629464626312256, "rewards/tag_count_reward": 0.8805803954601288, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 384.10716247558594, "epoch": 0.27899335374505263, "grad_norm": 0.4327075779438019, "kl": 0.239013671875, "learning_rate": 1.8110963577739654e-05, "loss": 0.7639, "reward": 1.7639509737491608, "reward_std": 0.5259297117590904, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.7232143133878708, "rewards/tag_count_reward": 0.906808078289032, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 234.97098922729492, "epoch": 0.2792920618325741, "grad_norm": 1.0421653985977173, "kl": 0.25, "learning_rate": 1.8104858380061178e-05, "loss": 0.8397, "reward": 1.8666295409202576, "reward_std": 0.3772808462381363, "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.8616071939468384, "rewards/tag_count_reward": 0.953683078289032, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 331.68751525878906, "epoch": 0.27959076992009557, "grad_norm": 0.43408915400505066, "kl": 0.2880859375, "learning_rate": 1.809874436509115e-05, "loss": 0.9007, "reward": 1.7572545409202576, "reward_std": 0.5069598630070686, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.7633928954601288, "rewards/tag_count_reward": 0.91573666036129, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 324.24554443359375, "epoch": 0.27988947800761704, "grad_norm": 0.8099424839019775, "kl": 0.41259765625, "learning_rate": 1.809262153948101e-05, "loss": 1.071, "reward": 1.7087054550647736, "reward_std": 0.517625592648983, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.7589286267757416, "rewards/tag_count_reward": 0.9140625447034836, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 212.90179443359375, "epoch": 0.2801881860951385, "grad_norm": 0.6947817206382751, "kl": 0.25048828125, "learning_rate": 1.80864899098918e-05, "loss": 1.0258, "reward": 1.8325893580913544, "reward_std": 0.4178910180926323, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.8816964775323868, "rewards/tag_count_reward": 0.941964328289032, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 123.34599113464355, "epoch": 0.28048689418266, "grad_norm": 0.4654977321624756, "kl": 0.263671875, "learning_rate": 1.8080349482994132e-05, "loss": 0.525, "reward": 1.9369420409202576, "reward_std": 0.18344547040760517, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9575893431901932, "rewards/tag_count_reward": 0.9793527275323868, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 129.4776840209961, "epoch": 0.28078560227018146, "grad_norm": 0.707318902015686, "kl": 0.35107421875, "learning_rate": 1.8074200265468183e-05, "loss": 0.7124, "reward": 1.98772332072258, "reward_std": 0.2282431647181511, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9787946939468384, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 148.5379524230957, "epoch": 0.2810843103577029, "grad_norm": 0.4051108956336975, "kl": 0.31689453125, "learning_rate": 1.80680422640037e-05, "loss": 0.6377, "reward": 2.0184152722358704, "reward_std": 0.2274763286113739, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9782366454601288, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 129.20089530944824, "epoch": 0.2813830184452244, "grad_norm": 0.8141277432441711, "kl": 0.32666015625, "learning_rate": 1.8061875485299987e-05, "loss": 0.7394, "reward": 1.9520090222358704, "reward_std": 0.23401820473372936, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9787947088479996, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 142.6897373199463, "epoch": 0.28168172653274587, "grad_norm": 0.49476152658462524, "kl": 0.2470703125, "learning_rate": 1.80556999360659e-05, "loss": 0.77, "reward": 1.9587054550647736, "reward_std": 0.24835725128650665, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.972098246216774, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 144.3147373199463, "epoch": 0.28198043462026734, "grad_norm": 0.4160972535610199, "kl": 0.2451171875, "learning_rate": 1.804951562301982e-05, "loss": 0.5206, "reward": 1.9508929252624512, "reward_std": 0.20315229706466198, "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.9531250298023224, "rewards/tag_count_reward": 0.9776786118745804, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 138.2656307220459, "epoch": 0.2822791427077888, "grad_norm": 0.3042178750038147, "kl": 0.224609375, "learning_rate": 1.8043322552889685e-05, "loss": 0.4581, "reward": 1.9681920409202576, "reward_std": 0.1495867818593979, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9882812947034836, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 149.85937881469727, "epoch": 0.2825778507953103, "grad_norm": 0.6084446310997009, "kl": 0.204833984375, "learning_rate": 1.803712073241294e-05, "loss": 0.5231, "reward": 1.987165242433548, "reward_std": 0.1859233658760786, "rewards/accuracy_reward": 0.026785715483129025, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9916295111179352, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 197.1317024230957, "epoch": 0.28287655888283175, "grad_norm": 0.4133909046649933, "kl": 0.216796875, "learning_rate": 1.8030910168336558e-05, "loss": 0.5137, "reward": 1.9570313394069672, "reward_std": 0.22406762465834618, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9771205931901932, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 193.41742324829102, "epoch": 0.2831752669703532, "grad_norm": 0.49017149209976196, "kl": 0.21044921875, "learning_rate": 1.802469086741703e-05, "loss": 0.5891, "reward": 1.9754465222358704, "reward_std": 0.22469544783234596, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.948660746216774, "rewards/tag_count_reward": 0.9821428954601288, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 268.9442024230957, "epoch": 0.2834739750578747, "grad_norm": 1.0024855136871338, "kl": 0.219970703125, "learning_rate": 1.801846283642034e-05, "loss": 0.7102, "reward": 1.8521206080913544, "reward_std": 0.3748982548713684, "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.8883928805589676, "rewards/tag_count_reward": 0.9503348469734192, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 380.42859649658203, "epoch": 0.28377268314539617, "grad_norm": 0.8266788721084595, "kl": 0.4423828125, "learning_rate": 1.801222608212198e-05, "loss": 0.9398, "reward": 1.6997768580913544, "reward_std": 0.6256737858057022, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.7276786118745804, "rewards/tag_count_reward": 0.8716518133878708, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 432.48216247558594, "epoch": 0.28407139123291764, "grad_norm": 1.301263451576233, "kl": 0.4716796875, "learning_rate": 1.8005980611306926e-05, "loss": 0.878, "reward": 1.5703125596046448, "reward_std": 0.5969109088182449, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.6696428805589676, "rewards/tag_count_reward": 0.8627232611179352, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 319.9241180419922, "epoch": 0.2843700993204391, "grad_norm": 0.47672057151794434, "kl": 0.34765625, "learning_rate": 1.799972643076963e-05, "loss": 0.9831, "reward": 1.7405134439468384, "reward_std": 0.5127791464328766, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.7879464626312256, "rewards/tag_count_reward": 0.9168527126312256, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 257.3437614440918, "epoch": 0.2846688074079606, "grad_norm": 0.4835071265697479, "kl": 0.29248046875, "learning_rate": 1.7993463547314044e-05, "loss": 0.806, "reward": 1.8404018580913544, "reward_std": 0.38476159423589706, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.8660714626312256, "rewards/tag_count_reward": 0.9386161267757416, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 225.0156364440918, "epoch": 0.28496751549548205, "grad_norm": 1.5009877681732178, "kl": 0.28173828125, "learning_rate": 1.798719196775356e-05, "loss": 0.921, "reward": 1.8688616752624512, "reward_std": 0.38479653745889664, "rewards/accuracy_reward": 0.0267857164144516, "rewards/format_reward": 0.88839291036129, "rewards/tag_count_reward": 0.9536830931901932, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 155.5513458251953, "epoch": 0.2852662235830035, "grad_norm": 6.340460300445557, "kl": 0.32666015625, "learning_rate": 1.7980911698911045e-05, "loss": 0.3847, "reward": 2.0591518580913544, "reward_std": 0.1876828558743, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9854910969734192, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 117.1004524230957, "epoch": 0.285564931670525, "grad_norm": 8.343101501464844, "kl": 1.0625, "learning_rate": 1.797462274761881e-05, "loss": 0.073, "reward": 2.0412946939468384, "reward_std": 0.05281830672174692, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9988839626312256, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 116.64956092834473, "epoch": 0.28586363975804646, "grad_norm": 7.063857078552246, "kl": 0.75439453125, "learning_rate": 1.7968325120718624e-05, "loss": 0.2194, "reward": 2.0351562798023224, "reward_std": 0.07483086362481117, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9949776977300644, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 111.50000381469727, "epoch": 0.28616234784556793, "grad_norm": 8.509167671203613, "kl": 0.611328125, "learning_rate": 1.796201882506169e-05, "loss": 0.1133, "reward": 2.0675224363803864, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9983258992433548, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 111.34822082519531, "epoch": 0.2864610559330894, "grad_norm": 11.163392066955566, "kl": 0.67724609375, "learning_rate": 1.7955703867508634e-05, "loss": 0.2024, "reward": 2.060267925262451, "reward_std": 0.044642859138548374, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 118.14286231994629, "epoch": 0.2867597640206109, "grad_norm": 0.9653461575508118, "kl": 0.531982421875, "learning_rate": 1.794938025492951e-05, "loss": 0.0827, "reward": 2.029017925262451, "reward_std": 0.0267857164144516, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678656578064, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 105.12277412414551, "epoch": 0.28705847210813235, "grad_norm": 0.26120084524154663, "kl": 0.32666015625, "learning_rate": 1.7943047994203796e-05, "loss": 0.0223, "reward": 2.0318081080913544, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9983258992433548, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 107.3593807220459, "epoch": 0.2873571801956538, "grad_norm": 4.012116432189941, "kl": 0.28662109375, "learning_rate": 1.7936707092220363e-05, "loss": 0.0969, "reward": 2.0613840520381927, "reward_std": 0.04017857275903225, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966517984867096, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 117.0714340209961, "epoch": 0.2876558882831753, "grad_norm": 3.9074490070343018, "kl": 0.324951171875, "learning_rate": 1.79303575558775e-05, "loss": 0.1553, "reward": 2.022879481315613, "reward_std": 0.05133928917348385, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9938616305589676, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 110.0870590209961, "epoch": 0.28795459637069676, "grad_norm": 0.43424394726753235, "kl": 0.33349609375, "learning_rate": 1.792399939208287e-05, "loss": 0.1262, "reward": 2.0000000596046448, "reward_std": 0.051686200546100736, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.995535746216774, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 143.00670433044434, "epoch": 0.28825330445821823, "grad_norm": 0.5443654656410217, "kl": 0.451171875, "learning_rate": 1.791763260775354e-05, "loss": 0.3854, "reward": 1.9648438394069672, "reward_std": 0.19848398491740227, "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.9687500596046448, "rewards/tag_count_reward": 0.9827009439468384, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 175.15179443359375, "epoch": 0.2885520125457397, "grad_norm": 0.4852140545845032, "kl": 0.3671875, "learning_rate": 1.791125720981594e-05, "loss": 0.7066, "reward": 2.0011162161827087, "reward_std": 0.25438613444566727, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.9441964775323868, "rewards/tag_count_reward": 0.97433041036129, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 181.44866943359375, "epoch": 0.28885072063326117, "grad_norm": 0.8153749704360962, "kl": 0.4609375, "learning_rate": 1.7904873205205886e-05, "loss": 0.6863, "reward": 1.958147406578064, "reward_std": 0.2872319333255291, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.9441964626312256, "rewards/tag_count_reward": 0.9715402275323868, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 206.5959930419922, "epoch": 0.28914942872078264, "grad_norm": 0.4109874963760376, "kl": 0.3056640625, "learning_rate": 1.7898480600868544e-05, "loss": 0.596, "reward": 1.9363840222358704, "reward_std": 0.2836313098669052, "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.9397321790456772, "rewards/tag_count_reward": 0.9676339775323868, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 204.90848922729492, "epoch": 0.28944813680830406, "grad_norm": 0.6382594704627991, "kl": 0.35986328125, "learning_rate": 1.7892079403758444e-05, "loss": 0.5788, "reward": 1.9536831378936768, "reward_std": 0.2498920075595379, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9464286118745804, "rewards/tag_count_reward": 0.9715402126312256, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 210.35491943359375, "epoch": 0.28974684489582553, "grad_norm": 0.63507479429245, "kl": 0.30712890625, "learning_rate": 1.788566962083946e-05, "loss": 0.6245, "reward": 1.996093899011612, "reward_std": 0.2683752328157425, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.9441964775323868, "rewards/tag_count_reward": 0.9715402275323868, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 225.02902603149414, "epoch": 0.290045552983347, "grad_norm": 0.4483128786087036, "kl": 0.37109375, "learning_rate": 1.7879251259084803e-05, "loss": 0.6264, "reward": 2.032366156578064, "reward_std": 0.31498028710484505, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.9330357611179352, "rewards/tag_count_reward": 0.9587053954601288, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 197.73884963989258, "epoch": 0.29034426107086847, "grad_norm": 5.667562961578369, "kl": 0.498046875, "learning_rate": 1.787282432547703e-05, "loss": 0.4824, "reward": 1.94866082072258, "reward_std": 0.22782805934548378, "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.973214328289032, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 202.87723922729492, "epoch": 0.29064296915838994, "grad_norm": 0.4369342029094696, "kl": 0.34814453125, "learning_rate": 1.786638882700801e-05, "loss": 0.4209, "reward": 1.97600457072258, "reward_std": 0.2307378314435482, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.9486607611179352, "rewards/tag_count_reward": 0.9670759439468384, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 180.8482208251953, "epoch": 0.2909416772459114, "grad_norm": 0.2506786286830902, "kl": 0.314453125, "learning_rate": 1.7859944770678933e-05, "loss": 0.2566, "reward": 2.032366156578064, "reward_std": 0.11729154549539089, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9854911118745804, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 175.6942024230957, "epoch": 0.2912403853334329, "grad_norm": 0.28390660881996155, "kl": 0.27294921875, "learning_rate": 1.7853492163500306e-05, "loss": 0.4699, "reward": 2.0786831378936768, "reward_std": 0.20680510438978672, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.982700914144516, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 160.87500762939453, "epoch": 0.29153909342095435, "grad_norm": 0.18372879922389984, "kl": 0.25146484375, "learning_rate": 1.7847031012491925e-05, "loss": 0.2086, "reward": 2.013951003551483, "reward_std": 0.078854164108634, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9916295111179352, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 159.37054061889648, "epoch": 0.2918378015084758, "grad_norm": 0.22940145432949066, "kl": 0.2724609375, "learning_rate": 1.78405613246829e-05, "loss": 0.2241, "reward": 2.0468751192092896, "reward_std": 0.11041304469108582, "rewards/accuracy_reward": 0.0691964291036129, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 156.3973274230957, "epoch": 0.2921365095959973, "grad_norm": 0.15774716436862946, "kl": 0.234130859375, "learning_rate": 1.783408310711161e-05, "loss": 0.1445, "reward": 2.081473231315613, "reward_std": 0.057017866522073746, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.996651828289032, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 158.05804061889648, "epoch": 0.29243521768351877, "grad_norm": 0.21035929024219513, "kl": 0.27099609375, "learning_rate": 1.7827596366825718e-05, "loss": 0.0675, "reward": 2.0691965222358704, "reward_std": 0.08501227665692568, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 146.7589340209961, "epoch": 0.29273392577104024, "grad_norm": 0.21177516877651215, "kl": 0.29052734375, "learning_rate": 1.782110111088217e-05, "loss": 0.1157, "reward": 2.051897406578064, "reward_std": 0.07497656065970659, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9960937649011612, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 152.36161422729492, "epoch": 0.2930326338585617, "grad_norm": 0.3682345747947693, "kl": 0.286376953125, "learning_rate": 1.7814597346347163e-05, "loss": 0.071, "reward": 2.118861675262451, "reward_std": 0.08340324647724628, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 156.96875762939453, "epoch": 0.2933313419460832, "grad_norm": 0.6267690658569336, "kl": 0.296875, "learning_rate": 1.7808085080296154e-05, "loss": 0.0479, "reward": 2.0619420409202576, "reward_std": 0.04117330349981785, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.997209832072258, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 162.26116943359375, "epoch": 0.29363005003360465, "grad_norm": 1.0063753128051758, "kl": 0.2880859375, "learning_rate": 1.7801564319813854e-05, "loss": -0.0023, "reward": 2.0892857909202576, "reward_std": 0.06173977069556713, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9977678954601288, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 155.38616943359375, "epoch": 0.2939287581211261, "grad_norm": 2.2346034049987793, "kl": 0.275146484375, "learning_rate": 1.779503507199421e-05, "loss": 0.0348, "reward": 2.0859375596046448, "reward_std": 0.05457546189427376, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9988839328289032, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 177.42411422729492, "epoch": 0.2942274662086476, "grad_norm": 12.313162803649902, "kl": 0.68408203125, "learning_rate": 1.77884973439404e-05, "loss": 0.1797, "reward": 2.071986675262451, "reward_std": 0.11203670874238014, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616454601288, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 199.0647430419922, "epoch": 0.29452617429616906, "grad_norm": 10.167309761047363, "kl": 0.68505859375, "learning_rate": 1.7781951142764838e-05, "loss": 0.1105, "reward": 2.107142895460129, "reward_std": 0.051653504371643066, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678805589676, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 202.54911422729492, "epoch": 0.29482488238369053, "grad_norm": 2.75547456741333, "kl": 0.42138671875, "learning_rate": 1.7775396475589144e-05, "loss": 0.1344, "reward": 2.098772406578064, "reward_std": 0.05417834781110287, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9960937798023224, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 211.67858123779297, "epoch": 0.295123590471212, "grad_norm": 1.9300671815872192, "kl": 0.359375, "learning_rate": 1.7768833349544157e-05, "loss": 0.082, "reward": 2.041294753551483, "reward_std": 0.06784625351428986, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966518133878708, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 251.25447845458984, "epoch": 0.2954222985587335, "grad_norm": 3.972376585006714, "kl": 0.427734375, "learning_rate": 1.776226177176991e-05, "loss": 0.1149, "reward": 2.122767984867096, "reward_std": 0.09372477978467941, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357611179352, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 249.3325958251953, "epoch": 0.29572100664625495, "grad_norm": 1.9673961400985718, "kl": 0.37841796875, "learning_rate": 1.7755681749415644e-05, "loss": 0.084, "reward": 1.9988840818405151, "reward_std": 0.07023446820676327, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.994419664144516, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 307.32366943359375, "epoch": 0.2960197147337764, "grad_norm": 3.0267388820648193, "kl": 0.400146484375, "learning_rate": 1.774909328963977e-05, "loss": 0.2008, "reward": 2.0440849363803864, "reward_std": 0.13017532788217068, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9882812798023224, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 308.31250762939453, "epoch": 0.2963184228212979, "grad_norm": 0.6141567230224609, "kl": 0.31640625, "learning_rate": 1.7742496399609888e-05, "loss": 0.0691, "reward": 2.0708706378936768, "reward_std": 0.15048110485076904, "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.992745578289032, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 343.3259048461914, "epoch": 0.29661713090881936, "grad_norm": 7.514430046081543, "kl": 0.6259765625, "learning_rate": 1.773589108650277e-05, "loss": 0.2419, "reward": 2.0178572237491608, "reward_std": 0.17116030678153038, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.98214291036129, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 345.9687728881836, "epoch": 0.29691583899634083, "grad_norm": 0.6738871335983276, "kl": 0.5498046875, "learning_rate": 1.772927735750435e-05, "loss": 0.1554, "reward": 2.052455425262451, "reward_std": 0.21493914350867271, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9832589775323868, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 397.87278747558594, "epoch": 0.2972145470838623, "grad_norm": 2.223489999771118, "kl": 0.43505859375, "learning_rate": 1.7722655219809718e-05, "loss": 0.1197, "reward": 2.0474331378936768, "reward_std": 0.1639699451625347, "rewards/accuracy_reward": 0.08258928707800806, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9871652275323868, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 437.95760345458984, "epoch": 0.2975132551713838, "grad_norm": 0.5649505853652954, "kl": 0.29296875, "learning_rate": 1.7716024680623106e-05, "loss": 0.0563, "reward": 2.1238840222358704, "reward_std": 0.13272957131266594, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9921875298023224, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 455.6160888671875, "epoch": 0.29781196325890524, "grad_norm": 0.7349733114242554, "kl": 0.271240234375, "learning_rate": 1.770938574715789e-05, "loss": 0.0775, "reward": 2.047991156578064, "reward_std": 0.16726710088551044, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9877232611179352, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 493.47322845458984, "epoch": 0.2981106713464267, "grad_norm": 0.4936922490596771, "kl": 0.32568359375, "learning_rate": 1.7702738426636587e-05, "loss": 0.1375, "reward": 2.0340402722358704, "reward_std": 0.24774369224905968, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.95089291036129, "rewards/tag_count_reward": 0.9737723618745804, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 506.56475830078125, "epoch": 0.2984093794339482, "grad_norm": 0.8605767488479614, "kl": 0.36083984375, "learning_rate": 1.7696082726290825e-05, "loss": 0.131, "reward": 2.03069207072258, "reward_std": 0.23910916224122047, "rewards/accuracy_reward": 0.08928572060540318, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9771205633878708, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 564.654052734375, "epoch": 0.29870808752146966, "grad_norm": 0.7312789559364319, "kl": 0.54345703125, "learning_rate": 1.7689418653361354e-05, "loss": 0.151, "reward": 2.05022332072258, "reward_std": 0.3533405140042305, "rewards/accuracy_reward": 0.14732143748551607, "rewards/format_reward": 0.9375000447034836, "rewards/tag_count_reward": 0.9654018431901932, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 555.8102874755859, "epoch": 0.2990067956089911, "grad_norm": 394.1839599609375, "kl": 10.951171875, "learning_rate": 1.768274621509803e-05, "loss": 1.0181, "reward": 1.8225446939468384, "reward_std": 0.5274956673383713, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.8616071790456772, "rewards/tag_count_reward": 0.8850446939468384, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 559.8571624755859, "epoch": 0.2993055036965126, "grad_norm": 12.057634353637695, "kl": 2.3369140625, "learning_rate": 1.7676065418759814e-05, "loss": 0.3407, "reward": 1.7399554252624512, "reward_std": 0.6260671764612198, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.8415178954601288, "rewards/tag_count_reward": 0.862723246216774, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 568.3482360839844, "epoch": 0.29960421178403407, "grad_norm": 9.394242286682129, "kl": 1.33203125, "learning_rate": 1.7669376271614757e-05, "loss": 0.2712, "reward": 1.95256707072258, "reward_std": 0.5139819383621216, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.895089328289032, "rewards/tag_count_reward": 0.9190848469734192, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 512.2366333007812, "epoch": 0.29990291987155554, "grad_norm": 0.8574299216270447, "kl": 0.460205078125, "learning_rate": 1.7662678780939996e-05, "loss": 0.1281, "reward": 2.049665242433548, "reward_std": 0.27508315816521645, "rewards/accuracy_reward": 0.11830357694998384, "rewards/format_reward": 0.9598214477300644, "rewards/tag_count_reward": 0.9715402275323868, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 558.2276916503906, "epoch": 0.300201627959077, "grad_norm": 0.27844417095184326, "kl": 0.21923828125, "learning_rate": 1.7655972954021745e-05, "loss": 0.0874, "reward": 2.04631707072258, "reward_std": 0.3033268488943577, "rewards/accuracy_reward": 0.13616072316654027, "rewards/format_reward": 0.9464286118745804, "rewards/tag_count_reward": 0.9637277126312256, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 538.5469055175781, "epoch": 0.3005003360465985, "grad_norm": 0.3297044038772583, "kl": 0.232666015625, "learning_rate": 1.764925879815529e-05, "loss": 0.1491, "reward": 1.8878348767757416, "reward_std": 0.32170870900154114, "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.9241071790456772, "rewards/tag_count_reward": 0.9547991454601288, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 527.2299499511719, "epoch": 0.30079904413411995, "grad_norm": 0.25583237409591675, "kl": 0.216064453125, "learning_rate": 1.7642536320644964e-05, "loss": 0.1718, "reward": 1.9481027722358704, "reward_std": 0.40661608427762985, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.9107143133878708, "rewards/tag_count_reward": 0.9414062947034836, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 541.4085235595703, "epoch": 0.3010977522216414, "grad_norm": 1.0727146863937378, "kl": 0.342041015625, "learning_rate": 1.7635805528804175e-05, "loss": 0.2406, "reward": 1.9291295409202576, "reward_std": 0.4932979866862297, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.8928571790456772, "rewards/tag_count_reward": 0.9224330633878708, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 542.654052734375, "epoch": 0.3013964603091629, "grad_norm": 0.38281774520874023, "kl": 0.26025390625, "learning_rate": 1.7629066429955358e-05, "loss": 0.2387, "reward": 1.8906250894069672, "reward_std": 0.4868224039673805, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.879464328289032, "rewards/tag_count_reward": 0.9285714626312256, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 505.8839569091797, "epoch": 0.30169516839668437, "grad_norm": 0.5858570337295532, "kl": 0.217529296875, "learning_rate": 1.7622319031429995e-05, "loss": 0.2554, "reward": 1.94084832072258, "reward_std": 0.44672662764787674, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.9040178954601288, "rewards/tag_count_reward": 0.9386161118745804, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 453.06251525878906, "epoch": 0.30199387648420584, "grad_norm": 0.8756740689277649, "kl": 0.2275390625, "learning_rate": 1.7615563340568594e-05, "loss": 0.156, "reward": 2.026227742433548, "reward_std": 0.243971673771739, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9748884290456772, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 457.7232360839844, "epoch": 0.30229258457172725, "grad_norm": 0.30324211716651917, "kl": 0.2099609375, "learning_rate": 1.7608799364720685e-05, "loss": 0.1672, "reward": 2.0117188692092896, "reward_std": 0.2564270533621311, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.9531250298023224, "rewards/tag_count_reward": 0.9760045260190964, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 438.83038330078125, "epoch": 0.3025912926592487, "grad_norm": 0.3114936351776123, "kl": 0.219970703125, "learning_rate": 1.7602027111244807e-05, "loss": 0.1049, "reward": 2.0128348767757416, "reward_std": 0.16539617627859116, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.98604916036129, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 370.84153747558594, "epoch": 0.3028900007467702, "grad_norm": 0.20460286736488342, "kl": 0.232421875, "learning_rate": 1.7595246587508513e-05, "loss": 0.0568, "reward": 2.1283482909202576, "reward_std": 0.0580357164144516, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9944196939468384, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 353.5535888671875, "epoch": 0.30318870883429166, "grad_norm": 0.24168585240840912, "kl": 0.216796875, "learning_rate": 1.7588457800888342e-05, "loss": 0.0471, "reward": 2.0898438692092896, "reward_std": 0.08956352714449167, "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9960937947034836, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 338.9085006713867, "epoch": 0.30348741692181314, "grad_norm": 0.23408101499080658, "kl": 0.19287109375, "learning_rate": 1.7581660758769836e-05, "loss": 0.0748, "reward": 2.0786831378936768, "reward_std": 0.09409692045301199, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9960937798023224, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 315.73438262939453, "epoch": 0.3037861250093346, "grad_norm": 0.1574399769306183, "kl": 0.186767578125, "learning_rate": 1.7574855468547503e-05, "loss": 0.0048, "reward": 2.051897406578064, "reward_std": 0.06978568714112043, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9960937947034836, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 332.3214340209961, "epoch": 0.3040848330968561, "grad_norm": 0.4721401631832123, "kl": 0.21630859375, "learning_rate": 1.7568041937624843e-05, "loss": 0.1386, "reward": 2.212611734867096, "reward_std": 0.19374322518706322, "rewards/accuracy_reward": 0.2500000086147338, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9893973469734192, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 329.2745666503906, "epoch": 0.30438354118437755, "grad_norm": 11.354450225830078, "kl": 0.395263671875, "learning_rate": 1.7561220173414297e-05, "loss": 0.1168, "reward": 2.084821581840515, "reward_std": 0.11637553386390209, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9910714477300644, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 346.9575958251953, "epoch": 0.304682249271899, "grad_norm": 0.22706007957458496, "kl": 0.219482421875, "learning_rate": 1.755439018333728e-05, "loss": 0.0334, "reward": 2.142299234867096, "reward_std": 0.06989026255905628, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9994419813156128, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 306.3660888671875, "epoch": 0.3049809573594205, "grad_norm": 0.2910265326499939, "kl": 0.221923828125, "learning_rate": 1.7547551974824158e-05, "loss": 0.1162, "reward": 2.0000000596046448, "reward_std": 0.19731055200099945, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.979910746216774, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 323.02679443359375, "epoch": 0.30527966544694196, "grad_norm": 0.1401769071817398, "kl": 0.19873046875, "learning_rate": 1.7540705555314224e-05, "loss": 0.0174, "reward": 1.993303656578064, "reward_std": 0.05118321813642979, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 331.6049270629883, "epoch": 0.30557837353446343, "grad_norm": 0.23631435632705688, "kl": 0.186767578125, "learning_rate": 1.753385093225572e-05, "loss": 0.0796, "reward": 2.05803582072258, "reward_std": 0.16790901497006416, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9843750447034836, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 353.7031478881836, "epoch": 0.3058770816219849, "grad_norm": 0.1965683549642563, "kl": 0.156494140625, "learning_rate": 1.7526988113105794e-05, "loss": 0.0859, "reward": 1.9765625894069672, "reward_std": 0.2363225258886814, "rewards/accuracy_reward": 0.03348214435391128, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9743303954601288, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 414.3348388671875, "epoch": 0.3061757897095064, "grad_norm": 0.2448388785123825, "kl": 0.180419921875, "learning_rate": 1.7520117105330524e-05, "loss": 0.1386, "reward": 1.9335938096046448, "reward_std": 0.2427399381995201, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.9575893133878708, "rewards/tag_count_reward": 0.9693080633878708, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 384.85938262939453, "epoch": 0.30647449779702785, "grad_norm": 0.43239709734916687, "kl": 0.200439453125, "learning_rate": 1.7513237916404896e-05, "loss": 0.1241, "reward": 2.0959822237491608, "reward_std": 0.20056847855448723, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.979910746216774, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 398.89064025878906, "epoch": 0.3067732058845493, "grad_norm": 0.2389671802520752, "kl": 0.210693359375, "learning_rate": 1.750635055381279e-05, "loss": 0.1767, "reward": 1.9447545111179352, "reward_std": 0.28707967698574066, "rewards/accuracy_reward": 0.024553572991862893, "rewards/format_reward": 0.948660746216774, "rewards/tag_count_reward": 0.9715402275323868, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 400.89064025878906, "epoch": 0.3070719139720708, "grad_norm": 0.36410287022590637, "kl": 0.20556640625, "learning_rate": 1.7499455025046982e-05, "loss": 0.2589, "reward": 1.9475446939468384, "reward_std": 0.3334804140031338, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.9241071790456772, "rewards/tag_count_reward": 0.9542411118745804, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 395.5067138671875, "epoch": 0.30737062205959226, "grad_norm": 0.3295355439186096, "kl": 0.24072265625, "learning_rate": 1.7492551337609134e-05, "loss": 0.3327, "reward": 1.9553572237491608, "reward_std": 0.4338413029909134, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.9218750447034836, "rewards/tag_count_reward": 0.9508928954601288, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 347.37501525878906, "epoch": 0.30766933014711373, "grad_norm": 0.7932664752006531, "kl": 0.312255859375, "learning_rate": 1.748563949900978e-05, "loss": 0.3998, "reward": 1.9380581378936768, "reward_std": 0.36801398545503616, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.9241071939468384, "rewards/tag_count_reward": 0.9581473618745804, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 334.5803756713867, "epoch": 0.3079680382346352, "grad_norm": 0.7846641540527344, "kl": 0.3955078125, "learning_rate": 1.7478719516768324e-05, "loss": 0.336, "reward": 1.8900670409202576, "reward_std": 0.327284537255764, "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.9263393431901932, "rewards/tag_count_reward": 0.9525670111179352, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 313.10716247558594, "epoch": 0.30826674632215667, "grad_norm": 0.40991348028182983, "kl": 0.268310546875, "learning_rate": 1.7471791398413026e-05, "loss": 0.3265, "reward": 1.9542411863803864, "reward_std": 0.2959762178361416, "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.9352678954601288, "rewards/tag_count_reward": 0.9676339775323868, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 316.28126525878906, "epoch": 0.30856545440967814, "grad_norm": 0.3766689598560333, "kl": 0.26513671875, "learning_rate": 1.7464855151481e-05, "loss": 0.3925, "reward": 1.9363840520381927, "reward_std": 0.3681555688381195, "rewards/accuracy_reward": 0.0468750037252903, "rewards/format_reward": 0.933035746216774, "rewards/tag_count_reward": 0.9564732611179352, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 269.08483123779297, "epoch": 0.3088641624971996, "grad_norm": 0.2861553132534027, "kl": 0.2041015625, "learning_rate": 1.7457910783518204e-05, "loss": 0.2304, "reward": 2.1250000596046448, "reward_std": 0.23014095798134804, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9843750298023224, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 241.59375762939453, "epoch": 0.3091628705847211, "grad_norm": 0.5758584141731262, "kl": 0.3154296875, "learning_rate": 1.7450958302079428e-05, "loss": 0.1728, "reward": 2.0424108505249023, "reward_std": 0.17593876644968987, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9843750298023224, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 231.4084930419922, "epoch": 0.30946157867224255, "grad_norm": 0.22198806703090668, "kl": 0.17333984375, "learning_rate": 1.7443997714728294e-05, "loss": 0.107, "reward": 2.029017925262451, "reward_std": 0.11156961880624294, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9910714626312256, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 226.97991943359375, "epoch": 0.309760286759764, "grad_norm": 0.24234211444854736, "kl": 0.1640625, "learning_rate": 1.7437029029037233e-05, "loss": 0.0961, "reward": 2.134486734867096, "reward_std": 0.13366702012717724, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9960937947034836, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 231.67858123779297, "epoch": 0.3100589948472855, "grad_norm": 0.24324029684066772, "kl": 0.161865234375, "learning_rate": 1.7430052252587498e-05, "loss": 0.0974, "reward": 2.13225457072258, "reward_std": 0.16904658637940884, "rewards/accuracy_reward": 0.14732143399305642, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 1038 }, { "clip_ratio": 0.0, "completion_length": 262.8750114440918, "epoch": 0.31035770293480697, "grad_norm": 0.2686210572719574, "kl": 0.162353515625, "learning_rate": 1.7423067392969137e-05, "loss": 0.0908, "reward": 2.0731027722358704, "reward_std": 0.12458669021725655, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 1039 }, { "clip_ratio": 0.0, "completion_length": 257.11608123779297, "epoch": 0.31065641102232844, "grad_norm": 0.350046306848526, "kl": 0.1640625, "learning_rate": 1.741607445778099e-05, "loss": 0.1722, "reward": 2.0747768580913544, "reward_std": 0.17902548052370548, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9787946790456772, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 266.2522392272949, "epoch": 0.3109551191098499, "grad_norm": 0.21502655744552612, "kl": 0.159912109375, "learning_rate": 1.7409073454630686e-05, "loss": 0.0525, "reward": 2.0100446939468384, "reward_std": 0.14836085587739944, "rewards/accuracy_reward": 0.037946431431919336, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9899553954601288, "step": 1041 }, { "clip_ratio": 0.0, "completion_length": 263.7589340209961, "epoch": 0.3112538271973714, "grad_norm": 0.28335756063461304, "kl": 0.1591796875, "learning_rate": 1.7402064391134626e-05, "loss": 0.2024, "reward": 2.060267984867096, "reward_std": 0.17676471173763275, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9843750447034836, "step": 1042 }, { "clip_ratio": 0.0, "completion_length": 263.27679443359375, "epoch": 0.31155253528489285, "grad_norm": 0.23032866418361664, "kl": 0.1953125, "learning_rate": 1.7395047274917994e-05, "loss": 0.1598, "reward": 1.9787947237491608, "reward_std": 0.10740913078188896, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9899553954601288, "step": 1043 }, { "clip_ratio": 0.0, "completion_length": 283.16966247558594, "epoch": 0.3118512433724143, "grad_norm": 0.3015403151512146, "kl": 0.218994140625, "learning_rate": 1.7388022113614722e-05, "loss": 0.231, "reward": 1.998883992433548, "reward_std": 0.23499133810400963, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.97433041036129, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 299.24778747558594, "epoch": 0.3121499514599358, "grad_norm": 0.5040445327758789, "kl": 0.21533203125, "learning_rate": 1.7380988914867488e-05, "loss": 0.3241, "reward": 1.977678656578064, "reward_std": 0.2858409658074379, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9642857611179352, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 295.5602798461914, "epoch": 0.31244865954745726, "grad_norm": 1.2429466247558594, "kl": 0.2841796875, "learning_rate": 1.7373947686327736e-05, "loss": 0.3119, "reward": 1.9977678954601288, "reward_std": 0.27840020693838596, "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.9531250298023224, "rewards/tag_count_reward": 0.9665178954601288, "step": 1046 }, { "clip_ratio": 0.0, "completion_length": 278.72098541259766, "epoch": 0.31274736763497873, "grad_norm": 0.5370210409164429, "kl": 0.226806640625, "learning_rate": 1.736689843565562e-05, "loss": 0.0722, "reward": 2.1121652722358704, "reward_std": 0.1402025744318962, "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 1047 }, { "clip_ratio": 0.0, "completion_length": 297.16072845458984, "epoch": 0.3130460757225002, "grad_norm": 0.9019346833229065, "kl": 0.214111328125, "learning_rate": 1.7359841170520043e-05, "loss": 0.105, "reward": 2.0546876192092896, "reward_std": 0.14801614731550217, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9899553805589676, "step": 1048 }, { "clip_ratio": 0.0, "completion_length": 307.18750762939453, "epoch": 0.3133447838100217, "grad_norm": 12.704946517944336, "kl": 24.782470703125, "learning_rate": 1.7352775898598615e-05, "loss": 0.1445, "reward": 2.0987724363803864, "reward_std": 0.20030156336724758, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.984933078289032, "step": 1049 }, { "clip_ratio": 0.0, "completion_length": 328.61385345458984, "epoch": 0.31364349189754315, "grad_norm": 0.2060595452785492, "kl": 0.175048828125, "learning_rate": 1.7345702627577655e-05, "loss": 0.1061, "reward": 1.99553582072258, "reward_std": 0.19064628146588802, "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9843750298023224, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 277.2544708251953, "epoch": 0.3139421999850646, "grad_norm": 0.24137713015079498, "kl": 0.212158203125, "learning_rate": 1.7338621365152195e-05, "loss": 0.2133, "reward": 2.031808078289032, "reward_std": 0.17910079285502434, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 1051 }, { "clip_ratio": 0.0, "completion_length": 354.3705596923828, "epoch": 0.3142409080725861, "grad_norm": 0.3093903064727783, "kl": 0.19287109375, "learning_rate": 1.7331532119025953e-05, "loss": 0.1479, "reward": 2.021763503551483, "reward_std": 0.21344264596700668, "rewards/accuracy_reward": 0.06919643376022577, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9793527126312256, "step": 1052 }, { "clip_ratio": 0.0, "completion_length": 330.45314025878906, "epoch": 0.31453961616010756, "grad_norm": 0.19436286389827728, "kl": 0.176025390625, "learning_rate": 1.7324434896911332e-05, "loss": 0.0892, "reward": 2.181361734867096, "reward_std": 0.13946893997490406, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9893973618745804, "step": 1053 }, { "clip_ratio": 0.0, "completion_length": 308.3459930419922, "epoch": 0.31483832424762903, "grad_norm": 0.22833842039108276, "kl": 0.202880859375, "learning_rate": 1.7317329706529413e-05, "loss": 0.1063, "reward": 2.070312649011612, "reward_std": 0.18434032518416643, "rewards/accuracy_reward": 0.09598214877769351, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9899553805589676, "step": 1054 }, { "clip_ratio": 0.0, "completion_length": 341.6026916503906, "epoch": 0.31513703233515045, "grad_norm": 0.29645785689353943, "kl": 0.21142578125, "learning_rate": 1.731021655560995e-05, "loss": 0.171, "reward": 2.0005581378936768, "reward_std": 0.21320422366261482, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687798023224, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 335.56250762939453, "epoch": 0.3154357404226719, "grad_norm": 0.43666332960128784, "kl": 0.2216796875, "learning_rate": 1.7303095451891356e-05, "loss": 0.1365, "reward": 2.0000001192092896, "reward_std": 0.196923503652215, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.9821428954601288, "step": 1056 }, { "clip_ratio": 0.0, "completion_length": 361.63841247558594, "epoch": 0.3157344485101934, "grad_norm": 0.22255021333694458, "kl": 0.190673828125, "learning_rate": 1.7295966403120685e-05, "loss": 0.1691, "reward": 2.018415242433548, "reward_std": 0.25568411126732826, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.9737723469734192, "step": 1057 }, { "clip_ratio": 0.0, "completion_length": 386.6272506713867, "epoch": 0.31603315659771486, "grad_norm": 0.49462029337882996, "kl": 0.219970703125, "learning_rate": 1.728882941705365e-05, "loss": 0.0539, "reward": 2.005580425262451, "reward_std": 0.0848214328289032, "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.994419664144516, "step": 1058 }, { "clip_ratio": 0.0, "completion_length": 342.5870666503906, "epoch": 0.31633186468523633, "grad_norm": 0.2759440839290619, "kl": 0.219970703125, "learning_rate": 1.7281684501454595e-05, "loss": 0.1675, "reward": 2.0457590520381927, "reward_std": 0.2355557195842266, "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.9642857760190964, "rewards/tag_count_reward": 0.9765625447034836, "step": 1059 }, { "clip_ratio": 0.0, "completion_length": 404.33484649658203, "epoch": 0.3166305727727578, "grad_norm": 0.23593617975711823, "kl": 0.17626953125, "learning_rate": 1.727453166409648e-05, "loss": 0.1677, "reward": 1.98381707072258, "reward_std": 0.28055629692971706, "rewards/accuracy_reward": 0.05133928661234677, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9726562947034836, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 388.38394927978516, "epoch": 0.31692928086027927, "grad_norm": 0.20100095868110657, "kl": 0.1845703125, "learning_rate": 1.72673709127609e-05, "loss": 0.1386, "reward": 1.9787947237491608, "reward_std": 0.22767801396548748, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9698661118745804, "step": 1061 }, { "clip_ratio": 0.0, "completion_length": 400.11385345458984, "epoch": 0.31722798894780074, "grad_norm": 0.5485225915908813, "kl": 0.2802734375, "learning_rate": 1.726020225523804e-05, "loss": 0.1435, "reward": 2.0156251192092896, "reward_std": 0.2727869488298893, "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9709821939468384, "step": 1062 }, { "clip_ratio": 0.0, "completion_length": 395.9598388671875, "epoch": 0.3175266970353222, "grad_norm": 0.2742289900779724, "kl": 0.428466796875, "learning_rate": 1.7253025699326706e-05, "loss": 0.1298, "reward": 2.0669643580913544, "reward_std": 0.19021883700042963, "rewards/accuracy_reward": 0.1205357238650322, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9776785969734192, "step": 1063 }, { "clip_ratio": 0.0, "completion_length": 408.45314025878906, "epoch": 0.3178254051228437, "grad_norm": 0.4669473469257355, "kl": 0.23388671875, "learning_rate": 1.7245841252834282e-05, "loss": 0.2056, "reward": 1.9927456378936768, "reward_std": 0.33031393960118294, "rewards/accuracy_reward": 0.07366071850992739, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9659598618745804, "step": 1064 }, { "clip_ratio": 0.0, "completion_length": 341.10938262939453, "epoch": 0.31812411321036516, "grad_norm": 0.29381176829338074, "kl": 0.2060546875, "learning_rate": 1.723864892357675e-05, "loss": 0.0956, "reward": 2.0675224363803864, "reward_std": 0.18148620054125786, "rewards/accuracy_reward": 0.10267857951112092, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9871652126312256, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 318.03126525878906, "epoch": 0.3184228212978866, "grad_norm": 0.6576449871063232, "kl": 0.2333984375, "learning_rate": 1.7231448719378645e-05, "loss": 0.0872, "reward": 2.0133929550647736, "reward_std": 0.16588891111314297, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9843750447034836, "step": 1066 }, { "clip_ratio": 0.0, "completion_length": 316.5044708251953, "epoch": 0.3187215293854081, "grad_norm": 0.27645179629325867, "kl": 0.46435546875, "learning_rate": 1.7224240648073097e-05, "loss": 0.0426, "reward": 2.088169664144516, "reward_std": 0.14756645075976849, "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553805589676, "step": 1067 }, { "clip_ratio": 0.0, "completion_length": 379.2455520629883, "epoch": 0.31902023747292957, "grad_norm": 0.251384437084198, "kl": 0.170166015625, "learning_rate": 1.7217024717501772e-05, "loss": 0.1368, "reward": 2.0563617050647736, "reward_std": 0.20753103122115135, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9827009439468384, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 444.39957427978516, "epoch": 0.31931894556045104, "grad_norm": 0.2183113992214203, "kl": 0.15283203125, "learning_rate": 1.72098009355149e-05, "loss": 0.115, "reward": 2.056919753551483, "reward_std": 0.24293025210499763, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9765625447034836, "step": 1069 }, { "clip_ratio": 0.0, "completion_length": 479.6473388671875, "epoch": 0.3196176536479725, "grad_norm": 0.2099137008190155, "kl": 0.151611328125, "learning_rate": 1.7202569309971245e-05, "loss": 0.1591, "reward": 2.050781339406967, "reward_std": 0.2892887145280838, "rewards/accuracy_reward": 0.12276786426082253, "rewards/format_reward": 0.9575893133878708, "rewards/tag_count_reward": 0.9704241454601288, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 516.2946853637695, "epoch": 0.319916361735494, "grad_norm": 0.18468257784843445, "kl": 0.1282958984375, "learning_rate": 1.7195329848738113e-05, "loss": 0.0959, "reward": 2.0814733505249023, "reward_std": 0.2800023518502712, "rewards/accuracy_reward": 0.149553582072258, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.97433041036129, "step": 1071 }, { "clip_ratio": 0.0, "completion_length": 511.94197845458984, "epoch": 0.32021506982301545, "grad_norm": 0.23751361668109894, "kl": 0.15625, "learning_rate": 1.7188082559691318e-05, "loss": 0.0984, "reward": 2.0039063692092896, "reward_std": 0.2922986000776291, "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9704241454601288, "step": 1072 }, { "clip_ratio": 0.0, "completion_length": 546.7009429931641, "epoch": 0.3205137779105369, "grad_norm": 0.1660633683204651, "kl": 0.355224609375, "learning_rate": 1.718082745071521e-05, "loss": 0.0696, "reward": 2.0491072237491608, "reward_std": 0.2633579969406128, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9709821939468384, "step": 1073 }, { "clip_ratio": 0.0, "completion_length": 549.1027069091797, "epoch": 0.3208124859980584, "grad_norm": 0.2678101658821106, "kl": 0.159423828125, "learning_rate": 1.7173564529702627e-05, "loss": 0.1673, "reward": 1.95256707072258, "reward_std": 0.35201262682676315, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.93526791036129, "rewards/tag_count_reward": 0.9592634290456772, "step": 1074 }, { "clip_ratio": 0.0, "completion_length": 540.8594055175781, "epoch": 0.32111119408557987, "grad_norm": 0.28119590878486633, "kl": 0.150634765625, "learning_rate": 1.716629380455493e-05, "loss": 0.1325, "reward": 2.0206474363803864, "reward_std": 0.29665707051754, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9441964775323868, "rewards/tag_count_reward": 0.96261166036129, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 556.3169860839844, "epoch": 0.32140990217310134, "grad_norm": 0.22669632732868195, "kl": 0.215576171875, "learning_rate": 1.715901528318194e-05, "loss": 0.1926, "reward": 1.9196429550647736, "reward_std": 0.41709214448928833, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.9107143133878708, "rewards/tag_count_reward": 0.93526791036129, "step": 1076 }, { "clip_ratio": 0.0, "completion_length": 598.1652069091797, "epoch": 0.3217086102606228, "grad_norm": 0.7067750096321106, "kl": 0.31494140625, "learning_rate": 1.715172897350198e-05, "loss": 0.2525, "reward": 1.8219866752624512, "reward_std": 0.5876220688223839, "rewards/accuracy_reward": 0.05803571571595967, "rewards/format_reward": 0.8660714626312256, "rewards/tag_count_reward": 0.8978795111179352, "step": 1077 }, { "clip_ratio": 0.0, "completion_length": 544.7969055175781, "epoch": 0.3220073183481443, "grad_norm": 0.3964366614818573, "kl": 0.251708984375, "learning_rate": 1.7144434883441843e-05, "loss": 0.205, "reward": 1.903459906578064, "reward_std": 0.4610176458954811, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.8995536118745804, "rewards/tag_count_reward": 0.9280134290456772, "step": 1078 }, { "clip_ratio": 0.0, "completion_length": 549.4911041259766, "epoch": 0.32230602643566575, "grad_norm": 0.24379128217697144, "kl": 0.159423828125, "learning_rate": 1.7137133020936783e-05, "loss": 0.1467, "reward": 2.042410761117935, "reward_std": 0.3568807505071163, "rewards/accuracy_reward": 0.13616071734577417, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.964285746216774, "step": 1079 }, { "clip_ratio": 0.0, "completion_length": 490.3348388671875, "epoch": 0.3226047345231872, "grad_norm": 0.18832315504550934, "kl": 0.327880859375, "learning_rate": 1.712982339393051e-05, "loss": 0.0634, "reward": 2.0552456080913544, "reward_std": 0.2608336377888918, "rewards/accuracy_reward": 0.12276786426082253, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9726562947034836, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 529.8594055175781, "epoch": 0.3229034426107087, "grad_norm": 0.2224011868238449, "kl": 0.15234375, "learning_rate": 1.7122506010375182e-05, "loss": 0.0758, "reward": 2.0496652722358704, "reward_std": 0.21101084724068642, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9804687947034836, "step": 1081 }, { "clip_ratio": 0.0, "completion_length": 483.3906478881836, "epoch": 0.32320215069823016, "grad_norm": 0.19936822354793549, "kl": 0.1351318359375, "learning_rate": 1.7115180878231394e-05, "loss": 0.0917, "reward": 2.1356027722358704, "reward_std": 0.24961041286587715, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9815848618745804, "step": 1082 }, { "clip_ratio": 0.0, "completion_length": 456.57592010498047, "epoch": 0.32350085878575163, "grad_norm": 0.31254926323890686, "kl": 0.1739501953125, "learning_rate": 1.7107848005468177e-05, "loss": 0.0968, "reward": 2.1222099661827087, "reward_std": 0.18917454034090042, "rewards/accuracy_reward": 0.16071428917348385, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.98604916036129, "step": 1083 }, { "clip_ratio": 0.0, "completion_length": 461.5781555175781, "epoch": 0.3237995668732731, "grad_norm": 0.19370058178901672, "kl": 0.141845703125, "learning_rate": 1.710050740006297e-05, "loss": 0.1278, "reward": 2.068080425262451, "reward_std": 0.23728307709097862, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9832589775323868, "step": 1084 }, { "clip_ratio": 0.0, "completion_length": 418.18751525878906, "epoch": 0.3240982749607946, "grad_norm": 0.20501428842544556, "kl": 0.1435546875, "learning_rate": 1.7093159070001637e-05, "loss": 0.0518, "reward": 2.0630581378936768, "reward_std": 0.1757730282843113, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9916295111179352, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 397.8549270629883, "epoch": 0.32439698304831605, "grad_norm": 0.28684964776039124, "kl": 0.384033203125, "learning_rate": 1.7085803023278444e-05, "loss": 0.0274, "reward": 2.079799175262451, "reward_std": 0.1692030355334282, "rewards/accuracy_reward": 0.10714286495931447, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9882813096046448, "step": 1086 }, { "clip_ratio": 0.0, "completion_length": 403.27457427978516, "epoch": 0.3246956911358375, "grad_norm": 0.4038228988647461, "kl": 0.162109375, "learning_rate": 1.7078439267896042e-05, "loss": 0.0933, "reward": 2.039062589406967, "reward_std": 0.2346351109445095, "rewards/accuracy_reward": 0.07589286239817739, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9832589477300644, "step": 1087 }, { "clip_ratio": 0.0, "completion_length": 417.18082427978516, "epoch": 0.324994399223359, "grad_norm": 0.19170694053173065, "kl": 0.162109375, "learning_rate": 1.7071067811865477e-05, "loss": 0.0441, "reward": 2.1138394474983215, "reward_std": 0.21150623261928558, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9910714626312256, "step": 1088 }, { "clip_ratio": 0.0, "completion_length": 387.48885345458984, "epoch": 0.32529310731088046, "grad_norm": 0.27376043796539307, "kl": 0.150634765625, "learning_rate": 1.7063688663206172e-05, "loss": 0.0701, "reward": 2.0959822237491608, "reward_std": 0.18950608745217323, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9866071939468384, "step": 1089 }, { "clip_ratio": 0.0, "completion_length": 470.1406478881836, "epoch": 0.32559181539840193, "grad_norm": 0.11023484170436859, "kl": 0.126220703125, "learning_rate": 1.705630182994592e-05, "loss": 0.046, "reward": 2.0435268878936768, "reward_std": 0.10965991858392954, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9944196790456772, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 396.1317138671875, "epoch": 0.3258905234859234, "grad_norm": 0.2426033914089203, "kl": 0.150146484375, "learning_rate": 1.7048907320120867e-05, "loss": 0.0521, "reward": 2.0496652722358704, "reward_std": 0.11840680707246065, "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9916294813156128, "step": 1091 }, { "clip_ratio": 0.0, "completion_length": 424.70091247558594, "epoch": 0.32618923157344487, "grad_norm": 0.31790393590927124, "kl": 0.1767578125, "learning_rate": 1.7041505141775517e-05, "loss": 0.1042, "reward": 2.0295760333538055, "reward_std": 0.2067285142838955, "rewards/accuracy_reward": 0.07589285890571773, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.97823666036129, "step": 1092 }, { "clip_ratio": 0.0, "completion_length": 443.9263610839844, "epoch": 0.32648793966096634, "grad_norm": 1.6403183937072754, "kl": 0.268798828125, "learning_rate": 1.7034095302962716e-05, "loss": 0.0532, "reward": 2.032924175262451, "reward_std": 0.22850735113024712, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9882812798023224, "step": 1093 }, { "clip_ratio": 0.0, "completion_length": 423.32591247558594, "epoch": 0.3267866477484878, "grad_norm": 2.0837466716766357, "kl": 0.386474609375, "learning_rate": 1.7026677811743638e-05, "loss": 0.0575, "reward": 2.1506696939468384, "reward_std": 0.16615738347172737, "rewards/accuracy_reward": 0.18750001350417733, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9832589626312256, "step": 1094 }, { "clip_ratio": 0.0, "completion_length": 454.9531555175781, "epoch": 0.3270853558360093, "grad_norm": 2.0946478843688965, "kl": 0.186279296875, "learning_rate": 1.701925267618779e-05, "loss": 0.0725, "reward": 2.0775670409202576, "reward_std": 0.16983093321323395, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.98604916036129, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 449.2835006713867, "epoch": 0.32738406392353075, "grad_norm": 2.47999906539917, "kl": 0.6259765625, "learning_rate": 1.7011819904372992e-05, "loss": 0.0608, "reward": 2.1099331378936768, "reward_std": 0.1853840947151184, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652126312256, "step": 1096 }, { "clip_ratio": 0.0, "completion_length": 486.2455596923828, "epoch": 0.3276827720110522, "grad_norm": 133.90843200683594, "kl": 9.70703125, "learning_rate": 1.700437950438537e-05, "loss": 0.3855, "reward": 2.002790242433548, "reward_std": 0.2548494152724743, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9827009439468384, "step": 1097 }, { "clip_ratio": 0.0, "completion_length": 494.42189025878906, "epoch": 0.32798148009857364, "grad_norm": 438.7192687988281, "kl": 10.215087890625, "learning_rate": 1.699693148431935e-05, "loss": 0.6976, "reward": 2.1361607909202576, "reward_std": 0.2504974827170372, "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9843750298023224, "step": 1098 }, { "clip_ratio": 0.0, "completion_length": 503.4777069091797, "epoch": 0.3282801881860951, "grad_norm": 1.1512190103530884, "kl": 0.348388671875, "learning_rate": 1.698947585227765e-05, "loss": -0.0062, "reward": 2.04241082072258, "reward_std": 0.16859184205532074, "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9866071790456772, "step": 1099 }, { "clip_ratio": 0.0, "completion_length": 533.8326263427734, "epoch": 0.3285788962736166, "grad_norm": 0.9055755734443665, "kl": 0.3509521484375, "learning_rate": 1.6982012616371263e-05, "loss": 0.0353, "reward": 2.064732253551483, "reward_std": 0.18035225570201874, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9888393133878708, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 512.0357360839844, "epoch": 0.32887760436113805, "grad_norm": 0.4822562634944916, "kl": 0.2630615234375, "learning_rate": 1.6974541784719458e-05, "loss": 0.0053, "reward": 2.0993304550647736, "reward_std": 0.14148844592273235, "rewards/accuracy_reward": 0.11830358020961285, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875298023224, "step": 1101 }, { "clip_ratio": 0.0, "completion_length": 511.5960159301758, "epoch": 0.3291763124486595, "grad_norm": 1.5008049011230469, "kl": 0.6650390625, "learning_rate": 1.6967063365449774e-05, "loss": -0.0345, "reward": 2.0239956080913544, "reward_std": 0.1829963531345129, "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.983816996216774, "step": 1102 }, { "clip_ratio": 0.0, "completion_length": 516.3303756713867, "epoch": 0.329475020536181, "grad_norm": 0.7869018316268921, "kl": 0.8568115234375, "learning_rate": 1.695957736669799e-05, "loss": 0.0048, "reward": 2.0385046005249023, "reward_std": 0.1695702113211155, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9871652126312256, "step": 1103 }, { "clip_ratio": 0.0, "completion_length": 518.1585083007812, "epoch": 0.32977372862370247, "grad_norm": 0.39834296703338623, "kl": 1.370849609375, "learning_rate": 1.6952083796608144e-05, "loss": -0.0378, "reward": 2.019531339406967, "reward_std": 0.18861424271017313, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.983816996216774, "step": 1104 }, { "clip_ratio": 0.0, "completion_length": 517.4062881469727, "epoch": 0.33007243671122394, "grad_norm": 2.2712624073028564, "kl": 4.000244140625, "learning_rate": 1.694458266333251e-05, "loss": -0.0303, "reward": 2.116629481315613, "reward_std": 0.20866131410002708, "rewards/accuracy_reward": 0.15401786426082253, "rewards/format_reward": 0.9799107313156128, "rewards/tag_count_reward": 0.9827009290456772, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 497.76342010498047, "epoch": 0.3303711447987454, "grad_norm": 0.3130522668361664, "kl": 0.35888671875, "learning_rate": 1.6937073975031576e-05, "loss": -0.0127, "reward": 2.1037946939468384, "reward_std": 0.0960923433303833, "rewards/accuracy_reward": 0.12053572339937091, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 1106 }, { "clip_ratio": 0.0, "completion_length": 515.5513610839844, "epoch": 0.3306698528862669, "grad_norm": 0.18764588236808777, "kl": 0.322265625, "learning_rate": 1.6929557739874064e-05, "loss": -0.0139, "reward": 2.172991156578064, "reward_std": 0.1266226079314947, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 1107 }, { "clip_ratio": 0.0, "completion_length": 459.26564025878906, "epoch": 0.33096856097378835, "grad_norm": 0.7857891321182251, "kl": 0.906982421875, "learning_rate": 1.69220339660369e-05, "loss": -0.0274, "reward": 2.100446492433548, "reward_std": 0.20471584796905518, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.986607164144516, "step": 1108 }, { "clip_ratio": 0.0, "completion_length": 535.834846496582, "epoch": 0.3312672690613098, "grad_norm": 0.4130170941352844, "kl": 0.339599609375, "learning_rate": 1.6914502661705216e-05, "loss": 0.004, "reward": 2.1250001788139343, "reward_std": 0.11164034903049469, "rewards/accuracy_reward": 0.149553582072258, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9910714626312256, "step": 1109 }, { "clip_ratio": 0.0, "completion_length": 534.2522583007812, "epoch": 0.3315659771488313, "grad_norm": 2.8097102642059326, "kl": 2.0576171875, "learning_rate": 1.6906963835072325e-05, "loss": 0.0003, "reward": 2.0485492050647736, "reward_std": 0.20720881968736649, "rewards/accuracy_reward": 0.08258929080329835, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9860491454601288, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 572.1562805175781, "epoch": 0.33186468523635276, "grad_norm": 0.6099815964698792, "kl": 0.1807861328125, "learning_rate": 1.6899417494339737e-05, "loss": 0.0114, "reward": 2.1333706378936768, "reward_std": 0.07705373130738735, "rewards/accuracy_reward": 0.13616071920841932, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 1111 }, { "clip_ratio": 0.0, "completion_length": 560.8348388671875, "epoch": 0.33216339332387423, "grad_norm": 0.3682892918586731, "kl": 0.5302734375, "learning_rate": 1.6891863647717135e-05, "loss": -0.0365, "reward": 2.1512277722358704, "reward_std": 0.28931667283177376, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.983816996216774, "step": 1112 }, { "clip_ratio": 0.0, "completion_length": 560.7522583007812, "epoch": 0.3324621014113957, "grad_norm": 1.0491515398025513, "kl": 0.341796875, "learning_rate": 1.688430230342236e-05, "loss": 0.0085, "reward": 2.027901917695999, "reward_std": 0.09892731811851263, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9921875298023224, "step": 1113 }, { "clip_ratio": 0.0, "completion_length": 537.4174423217773, "epoch": 0.3327608094989172, "grad_norm": 1.8887676000595093, "kl": 0.210205078125, "learning_rate": 1.6876733469681407e-05, "loss": 0.0177, "reward": 2.0167411267757416, "reward_std": 0.11882244423031807, "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9944196790456772, "step": 1114 }, { "clip_ratio": 0.0, "completion_length": 565.4174194335938, "epoch": 0.33305951758643865, "grad_norm": 0.28251394629478455, "kl": 0.259765625, "learning_rate": 1.6869157154728437e-05, "loss": -0.0158, "reward": 2.1261162161827087, "reward_std": 0.1697975341230631, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9921875149011612, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 538.0245742797852, "epoch": 0.3333582256739601, "grad_norm": 0.19714942574501038, "kl": 0.1151123046875, "learning_rate": 1.686157336680573e-05, "loss": 0.015, "reward": 2.224888503551483, "reward_std": 0.16661217156797647, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 1116 }, { "clip_ratio": 0.0, "completion_length": 542.1719055175781, "epoch": 0.3336569337614816, "grad_norm": 0.11726927012205124, "kl": 0.1158447265625, "learning_rate": 1.685398211416371e-05, "loss": 0.0155, "reward": 2.0089287757873535, "reward_std": 0.04791303817182779, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 1.0, "step": 1117 }, { "clip_ratio": 0.0, "completion_length": 515.7857437133789, "epoch": 0.33395564184900306, "grad_norm": 0.1877799779176712, "kl": 0.4075927734375, "learning_rate": 1.6846383405060905e-05, "loss": -0.0066, "reward": 2.091517925262451, "reward_std": 0.11290003079921007, "rewards/accuracy_reward": 0.10044643096625805, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9955357313156128, "step": 1118 }, { "clip_ratio": 0.0, "completion_length": 516.0089492797852, "epoch": 0.33425434993652453, "grad_norm": 0.17218632996082306, "kl": 0.2574462890625, "learning_rate": 1.683877724776398e-05, "loss": 0.006, "reward": 2.0502232909202576, "reward_std": 0.11060897074639797, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9944196790456772, "step": 1119 }, { "clip_ratio": 0.0, "completion_length": 525.1406478881836, "epoch": 0.334553058024046, "grad_norm": 0.13107414543628693, "kl": 0.183837890625, "learning_rate": 1.6831163650547678e-05, "loss": 0.0002, "reward": 2.0485492646694183, "reward_std": 0.08617874328047037, "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 535.0870819091797, "epoch": 0.3348517661115675, "grad_norm": 0.26226022839546204, "kl": 0.171142578125, "learning_rate": 1.6823542621694852e-05, "loss": 0.0168, "reward": 2.060267984867096, "reward_std": 0.1044102581217885, "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9977678954601288, "step": 1121 }, { "clip_ratio": 0.0, "completion_length": 545.553596496582, "epoch": 0.33515047419908894, "grad_norm": 0.17774035036563873, "kl": 0.172119140625, "learning_rate": 1.681591416949643e-05, "loss": -0.0017, "reward": 2.0323662161827087, "reward_std": 0.10919780191034079, "rewards/accuracy_reward": 0.04464285867288709, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9966517984867096, "step": 1122 }, { "clip_ratio": 0.0, "completion_length": 520.0134201049805, "epoch": 0.3354491822866104, "grad_norm": 0.38894689083099365, "kl": 0.404052734375, "learning_rate": 1.6808278302251425e-05, "loss": 0.0039, "reward": 2.0379464626312256, "reward_std": 0.12110753543674946, "rewards/accuracy_reward": 0.05803571664728224, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933036118745804, "step": 1123 }, { "clip_ratio": 0.0, "completion_length": 478.32144927978516, "epoch": 0.3357478903741319, "grad_norm": 0.34580910205841064, "kl": 0.1630859375, "learning_rate": 1.6800635028266908e-05, "loss": 0.0274, "reward": 2.117187589406967, "reward_std": 0.10889969021081924, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 1124 }, { "clip_ratio": 0.0, "completion_length": 512.3393096923828, "epoch": 0.33604659846165336, "grad_norm": 0.38455772399902344, "kl": 0.1446533203125, "learning_rate": 1.679298435585802e-05, "loss": 0.0411, "reward": 2.0608259737491608, "reward_std": 0.159828407689929, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9916294813156128, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 519.6495742797852, "epoch": 0.3363453065491748, "grad_norm": 0.6461003422737122, "kl": 0.1568603515625, "learning_rate": 1.678532629334793e-05, "loss": 0.0115, "reward": 2.1138394474983215, "reward_std": 0.12690858636051416, "rewards/accuracy_reward": 0.11830358067527413, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 1126 }, { "clip_ratio": 0.0, "completion_length": 497.60717010498047, "epoch": 0.3366440146366963, "grad_norm": 0.15263251960277557, "kl": 0.25048828125, "learning_rate": 1.677766084906787e-05, "loss": -0.0144, "reward": 2.035714328289032, "reward_std": 0.12103947438299656, "rewards/accuracy_reward": 0.04464285844005644, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9977678656578064, "step": 1127 }, { "clip_ratio": 0.0, "completion_length": 503.6205596923828, "epoch": 0.33694272272421777, "grad_norm": 0.5687162280082703, "kl": 0.3682861328125, "learning_rate": 1.6769988031357086e-05, "loss": -0.0024, "reward": 2.0295759737491608, "reward_std": 0.10386637970805168, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9916294813156128, "step": 1128 }, { "clip_ratio": 0.0, "completion_length": 523.4643173217773, "epoch": 0.33724143081173924, "grad_norm": 0.4483833312988281, "kl": 0.3267822265625, "learning_rate": 1.6762307848562858e-05, "loss": 0.0052, "reward": 2.051897406578064, "reward_std": 0.10691878292709589, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9960937798023224, "step": 1129 }, { "clip_ratio": 0.0, "completion_length": 536.7433319091797, "epoch": 0.3375401388992607, "grad_norm": 2.236499786376953, "kl": 0.741943359375, "learning_rate": 1.6754620309040464e-05, "loss": 0.041, "reward": 2.075892925262451, "reward_std": 0.14226006343960762, "rewards/accuracy_reward": 0.08928571571595967, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.995535746216774, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 535.8705673217773, "epoch": 0.3378388469867822, "grad_norm": 0.27861690521240234, "kl": 0.3238525390625, "learning_rate": 1.6746925421153196e-05, "loss": 0.0246, "reward": 2.0770090222358704, "reward_std": 0.14319850970059633, "rewards/accuracy_reward": 0.09151786123402417, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.994419664144516, "step": 1131 }, { "clip_ratio": 0.0, "completion_length": 505.6763610839844, "epoch": 0.33813755507430365, "grad_norm": 0.43136417865753174, "kl": 0.407958984375, "learning_rate": 1.6739223193272346e-05, "loss": 0.003, "reward": 2.0820313692092896, "reward_std": 0.20251084677875042, "rewards/accuracy_reward": 0.10267857275903225, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134439468384, "step": 1132 }, { "clip_ratio": 0.0, "completion_length": 541.5669860839844, "epoch": 0.3384362631618251, "grad_norm": 0.09390202909708023, "kl": 0.1075439453125, "learning_rate": 1.6731513633777173e-05, "loss": 0.004, "reward": 2.166852831840515, "reward_std": 0.08185798861086369, "rewards/accuracy_reward": 0.1696428619325161, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 1133 }, { "clip_ratio": 0.0, "completion_length": 535.7076263427734, "epoch": 0.3387349712493466, "grad_norm": 0.2668273448944092, "kl": 0.2249755859375, "learning_rate": 1.6723796751054925e-05, "loss": 0.0251, "reward": 2.0223214626312256, "reward_std": 0.09394084755331278, "rewards/accuracy_reward": 0.042410717345774174, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714626312256, "step": 1134 }, { "clip_ratio": 0.0, "completion_length": 542.9419784545898, "epoch": 0.33903367933686807, "grad_norm": 0.17246274650096893, "kl": 0.1295166015625, "learning_rate": 1.6716072553500816e-05, "loss": 0.026, "reward": 2.0742188692092896, "reward_std": 0.14488712046295404, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616454601288, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 521.9419784545898, "epoch": 0.33933238742438954, "grad_norm": 0.1479964405298233, "kl": 0.1199951171875, "learning_rate": 1.6708341049518016e-05, "loss": 0.0716, "reward": 2.0452009439468384, "reward_std": 0.18238546326756477, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9893973469734192, "step": 1136 }, { "clip_ratio": 0.0, "completion_length": 531.3995819091797, "epoch": 0.339631095511911, "grad_norm": 0.08657194674015045, "kl": 0.1190185546875, "learning_rate": 1.670060224751764e-05, "loss": 0.0283, "reward": 2.0284598767757416, "reward_std": 0.06985790771432221, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949777126312256, "step": 1137 }, { "clip_ratio": 0.0, "completion_length": 570.4062957763672, "epoch": 0.3399298035994325, "grad_norm": 0.21277548372745514, "kl": 0.1380615234375, "learning_rate": 1.669285615591875e-05, "loss": 0.0393, "reward": 1.9988840520381927, "reward_std": 0.12440321315079927, "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553805589676, "step": 1138 }, { "clip_ratio": 0.0, "completion_length": 573.0402069091797, "epoch": 0.34022851168695395, "grad_norm": 134.7840576171875, "kl": 4.420654296875, "learning_rate": 1.668510278314833e-05, "loss": 0.3548, "reward": 2.076451003551483, "reward_std": 0.08100571483373642, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 1139 }, { "clip_ratio": 0.0, "completion_length": 558.1094055175781, "epoch": 0.3405272197744754, "grad_norm": 0.11505535244941711, "kl": 0.112060546875, "learning_rate": 1.6677342137641294e-05, "loss": 0.0229, "reward": 2.0535715222358704, "reward_std": 0.12136515136808157, "rewards/accuracy_reward": 0.07366071944124997, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933035969734192, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 545.1986923217773, "epoch": 0.3408259278619969, "grad_norm": 0.24621063470840454, "kl": 0.16943359375, "learning_rate": 1.666957422784046e-05, "loss": 0.0597, "reward": 2.0987723767757416, "reward_std": 0.2610030993819237, "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9827009439468384, "step": 1141 }, { "clip_ratio": 0.0, "completion_length": 569.9375305175781, "epoch": 0.3411246359495183, "grad_norm": 0.11857727915048599, "kl": 0.1041259765625, "learning_rate": 1.666179906219656e-05, "loss": 0.0247, "reward": 2.01506707072258, "reward_std": 0.13483005575835705, "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9905134290456772, "step": 1142 }, { "clip_ratio": 0.0, "completion_length": 574.669677734375, "epoch": 0.3414233440370398, "grad_norm": 0.1404426544904709, "kl": 0.1278076171875, "learning_rate": 1.6654016649168203e-05, "loss": 0.0178, "reward": 2.0150671005249023, "reward_std": 0.14617122523486614, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9927455484867096, "step": 1143 }, { "clip_ratio": 0.0, "completion_length": 588.2277069091797, "epoch": 0.34172205212456125, "grad_norm": 0.1648350954055786, "kl": 0.152587890625, "learning_rate": 1.66462269972219e-05, "loss": 0.051, "reward": 2.0027902722358704, "reward_std": 0.2254163809120655, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 1144 }, { "clip_ratio": 0.0, "completion_length": 592.6629791259766, "epoch": 0.3420207602120827, "grad_norm": 0.21733282506465912, "kl": 0.127197265625, "learning_rate": 1.6638430114832015e-05, "loss": 0.0705, "reward": 2.1155134737491608, "reward_std": 0.24180055409669876, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9815848618745804, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 631.7053985595703, "epoch": 0.3423194682996042, "grad_norm": 0.1670110523700714, "kl": 0.1302490234375, "learning_rate": 1.6630626010480807e-05, "loss": 0.0349, "reward": 2.09319207072258, "reward_std": 0.24658512324094772, "rewards/accuracy_reward": 0.13616071990691125, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9838170111179352, "step": 1146 }, { "clip_ratio": 0.0, "completion_length": 600.6138458251953, "epoch": 0.34261817638712566, "grad_norm": 21.513092041015625, "kl": 0.620361328125, "learning_rate": 1.662281469265837e-05, "loss": 0.1106, "reward": 2.151227831840515, "reward_std": 0.1889546886086464, "rewards/accuracy_reward": 0.19196430034935474, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9815848618745804, "step": 1147 }, { "clip_ratio": 0.0, "completion_length": 583.8281555175781, "epoch": 0.34291688447464713, "grad_norm": 0.34892863035202026, "kl": 0.1920166015625, "learning_rate": 1.6614996169862654e-05, "loss": 0.0178, "reward": 2.1071430444717407, "reward_std": 0.19850512593984604, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9888393133878708, "step": 1148 }, { "clip_ratio": 0.0, "completion_length": 586.0089569091797, "epoch": 0.3432155925621686, "grad_norm": 0.16194447875022888, "kl": 0.266845703125, "learning_rate": 1.6607170450599445e-05, "loss": 0.0297, "reward": 2.015625089406967, "reward_std": 0.23419231176376343, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9687500447034836, "step": 1149 }, { "clip_ratio": 0.0, "completion_length": 575.2187652587891, "epoch": 0.3435143006496901, "grad_norm": 0.28007134795188904, "kl": 0.203369140625, "learning_rate": 1.6599337543382356e-05, "loss": 0.0246, "reward": 2.051897406578064, "reward_std": 0.17872450314462185, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.984933078289032, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 597.435302734375, "epoch": 0.34381300873721155, "grad_norm": 0.16538354754447937, "kl": 0.1119384765625, "learning_rate": 1.6591497456732827e-05, "loss": 0.0292, "reward": 2.09319207072258, "reward_std": 0.17552635818719864, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9882812947034836, "step": 1151 }, { "clip_ratio": 0.0, "completion_length": 589.2678833007812, "epoch": 0.344111716824733, "grad_norm": 0.14256902039051056, "kl": 0.11083984375, "learning_rate": 1.6583650199180097e-05, "loss": 0.0253, "reward": 2.0306920409202576, "reward_std": 0.1148433219641447, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455484867096, "step": 1152 }, { "clip_ratio": 0.0, "completion_length": 620.7031555175781, "epoch": 0.3444104249122545, "grad_norm": 0.21060490608215332, "kl": 0.1375732421875, "learning_rate": 1.6575795779261222e-05, "loss": 0.0418, "reward": 2.1545759737491608, "reward_std": 0.2164953500032425, "rewards/accuracy_reward": 0.19419643748551607, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.984933078289032, "step": 1153 }, { "clip_ratio": 0.0, "completion_length": 592.1562652587891, "epoch": 0.34470913299977596, "grad_norm": 0.14284077286720276, "kl": 0.1109619140625, "learning_rate": 1.6567934205521036e-05, "loss": 0.0377, "reward": 2.0703125596046448, "reward_std": 0.1953265629708767, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9854910969734192, "step": 1154 }, { "clip_ratio": 0.0, "completion_length": 545.6428680419922, "epoch": 0.34500784108729743, "grad_norm": 0.20860683917999268, "kl": 0.1083984375, "learning_rate": 1.656006548651216e-05, "loss": 0.018, "reward": 2.0864956378936768, "reward_std": 0.15861098654568195, "rewards/accuracy_reward": 0.10491071734577417, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.992745578289032, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 565.0602874755859, "epoch": 0.3453065491748189, "grad_norm": 0.11792613565921783, "kl": 0.101318359375, "learning_rate": 1.6552189630794987e-05, "loss": 0.0456, "reward": 2.089843839406967, "reward_std": 0.12209773994982243, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616454601288, "step": 1156 }, { "clip_ratio": 0.0, "completion_length": 561.7745742797852, "epoch": 0.34560525726234037, "grad_norm": 0.295906126499176, "kl": 0.1605224609375, "learning_rate": 1.6544306646937683e-05, "loss": 0.0774, "reward": 1.9787947535514832, "reward_std": 0.21501988917589188, "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9787946939468384, "step": 1157 }, { "clip_ratio": 0.0, "completion_length": 551.7232360839844, "epoch": 0.34590396534986184, "grad_norm": 0.12675856053829193, "kl": 0.1043701171875, "learning_rate": 1.6536416543516157e-05, "loss": 0.0125, "reward": 2.1054688692092896, "reward_std": 0.1363515630364418, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 1158 }, { "clip_ratio": 0.0, "completion_length": 519.3415374755859, "epoch": 0.3462026734373833, "grad_norm": 0.08094298839569092, "kl": 0.1014404296875, "learning_rate": 1.652851932911407e-05, "loss": 0.0325, "reward": 2.0446428656578064, "reward_std": 0.0851701945066452, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1159 }, { "clip_ratio": 0.0, "completion_length": 533.4910888671875, "epoch": 0.3465013815249048, "grad_norm": 0.196476548910141, "kl": 0.1287841796875, "learning_rate": 1.6520615012322815e-05, "loss": 0.0515, "reward": 2.084263503551483, "reward_std": 0.12667076010257006, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9860491305589676, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 556.5602951049805, "epoch": 0.34680008961242625, "grad_norm": 0.12217886000871658, "kl": 0.100830078125, "learning_rate": 1.6512703601741517e-05, "loss": 0.0583, "reward": 2.020647406578064, "reward_std": 0.1342063844203949, "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.991629496216774, "step": 1161 }, { "clip_ratio": 0.0, "completion_length": 546.8750305175781, "epoch": 0.3470987976999477, "grad_norm": 0.20141614973545074, "kl": 0.1187744140625, "learning_rate": 1.6504785105977012e-05, "loss": 0.0361, "reward": 2.158482253551483, "reward_std": 0.14209691807627678, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.988839328289032, "step": 1162 }, { "clip_ratio": 0.0, "completion_length": 583.1295013427734, "epoch": 0.3473975057874692, "grad_norm": 39.8187370300293, "kl": 0.2957763671875, "learning_rate": 1.649685953364385e-05, "loss": 0.0751, "reward": 2.0965402126312256, "reward_std": 0.13150635734200478, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9916295111179352, "step": 1163 }, { "clip_ratio": 0.0, "completion_length": 552.1585083007812, "epoch": 0.34769621387499067, "grad_norm": 43.59043502807617, "kl": 0.41259765625, "learning_rate": 1.6488926893364276e-05, "loss": 0.0747, "reward": 2.0530134737491608, "reward_std": 0.17319795489311218, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9860491454601288, "step": 1164 }, { "clip_ratio": 0.0, "completion_length": 565.0401916503906, "epoch": 0.34799492196251214, "grad_norm": 18.0660457611084, "kl": 0.1912841796875, "learning_rate": 1.6480987193768227e-05, "loss": 0.0284, "reward": 2.1082590222358704, "reward_std": 0.1239287331700325, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 577.1339569091797, "epoch": 0.3482936300500336, "grad_norm": 10.063819885253906, "kl": 0.658203125, "learning_rate": 1.6473040443493314e-05, "loss": 0.1071, "reward": 2.1222099363803864, "reward_std": 0.2755516339093447, "rewards/accuracy_reward": 0.17633929336443543, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9748884439468384, "step": 1166 }, { "clip_ratio": 0.0, "completion_length": 618.8661193847656, "epoch": 0.3485923381375551, "grad_norm": 0.09710503369569778, "kl": 0.098876953125, "learning_rate": 1.6465086651184826e-05, "loss": 0.0161, "reward": 2.0831474661827087, "reward_std": 0.10629567224532366, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937798023224, "step": 1167 }, { "clip_ratio": 0.0, "completion_length": 581.4397583007812, "epoch": 0.34889104622507655, "grad_norm": 0.8230822682380676, "kl": 0.1719970703125, "learning_rate": 1.645712582549571e-05, "loss": 0.0411, "reward": 2.006696581840515, "reward_std": 0.1466161198914051, "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9888393133878708, "step": 1168 }, { "clip_ratio": 0.0, "completion_length": 572.9933242797852, "epoch": 0.349189754312598, "grad_norm": 0.21225635707378387, "kl": 0.1181640625, "learning_rate": 1.644915797508656e-05, "loss": 0.0444, "reward": 2.012276828289032, "reward_std": 0.18235558830201626, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.987723246216774, "step": 1169 }, { "clip_ratio": 0.0, "completion_length": 586.2009124755859, "epoch": 0.3494884624001195, "grad_norm": 0.16427995264530182, "kl": 0.108154296875, "learning_rate": 1.6441183108625617e-05, "loss": 0.0214, "reward": 2.1127232909202576, "reward_std": 0.13267517648637295, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 613.4241180419922, "epoch": 0.34978717048764096, "grad_norm": 0.6009090542793274, "kl": 0.1436767578125, "learning_rate": 1.6433201234788758e-05, "loss": 0.0446, "reward": 2.0418527722358704, "reward_std": 0.20847126469016075, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.98604916036129, "step": 1171 }, { "clip_ratio": 0.0, "completion_length": 610.7701263427734, "epoch": 0.35008587857516243, "grad_norm": 0.27805885672569275, "kl": 0.120361328125, "learning_rate": 1.6425212362259474e-05, "loss": 0.0295, "reward": 2.158482253551483, "reward_std": 0.15760973654687405, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9933035969734192, "step": 1172 }, { "clip_ratio": 0.0, "completion_length": 613.9375305175781, "epoch": 0.3503845866626839, "grad_norm": 0.21993836760520935, "kl": 0.142333984375, "learning_rate": 1.641721649972888e-05, "loss": 0.0207, "reward": 2.0273438692092896, "reward_std": 0.14879334717988968, "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973618745804, "step": 1173 }, { "clip_ratio": 0.0, "completion_length": 635.1540374755859, "epoch": 0.3506832947502054, "grad_norm": 0.16722017526626587, "kl": 0.1435546875, "learning_rate": 1.640921365589569e-05, "loss": 0.0536, "reward": 2.0167411863803864, "reward_std": 0.1951602455228567, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9854911267757416, "step": 1174 }, { "clip_ratio": 0.0, "completion_length": 620.2254638671875, "epoch": 0.35098200283772685, "grad_norm": 0.1438942402601242, "kl": 0.1190185546875, "learning_rate": 1.6401203839466212e-05, "loss": 0.0408, "reward": 2.1584821939468384, "reward_std": 0.24855397827923298, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9866071790456772, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 628.0089416503906, "epoch": 0.3512807109252483, "grad_norm": 0.1377178281545639, "kl": 0.0992431640625, "learning_rate": 1.6393187059154344e-05, "loss": 0.026, "reward": 2.0853795409202576, "reward_std": 0.16046156361699104, "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9938616305589676, "step": 1176 }, { "clip_ratio": 0.0, "completion_length": 618.7857360839844, "epoch": 0.3515794190127698, "grad_norm": 0.3011901378631592, "kl": 0.1312255859375, "learning_rate": 1.6385163323681554e-05, "loss": 0.0539, "reward": 2.0585938692092896, "reward_std": 0.182827889919281, "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652126312256, "step": 1177 }, { "clip_ratio": 0.0, "completion_length": 560.5111846923828, "epoch": 0.35187812710029126, "grad_norm": 0.18412728607654572, "kl": 0.114990234375, "learning_rate": 1.637713264177688e-05, "loss": 0.0239, "reward": 2.161272406578064, "reward_std": 0.11882243119180202, "rewards/accuracy_reward": 0.17187500977888703, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937649011612, "step": 1178 }, { "clip_ratio": 0.0, "completion_length": 643.9308319091797, "epoch": 0.35217683518781273, "grad_norm": 0.18599754571914673, "kl": 0.1246337890625, "learning_rate": 1.636909502217692e-05, "loss": 0.0494, "reward": 2.085379511117935, "reward_std": 0.15808406844735146, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9871652126312256, "step": 1179 }, { "clip_ratio": 0.0, "completion_length": 610.4375305175781, "epoch": 0.3524755432753342, "grad_norm": 0.07854767888784409, "kl": 0.0911865234375, "learning_rate": 1.6361050473625813e-05, "loss": 0.016, "reward": 2.1445313692092896, "reward_std": 0.0647010775282979, "rewards/accuracy_reward": 0.15625000931322575, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 591.2656402587891, "epoch": 0.3527742513628557, "grad_norm": 0.17565259337425232, "kl": 0.12548828125, "learning_rate": 1.6352999004875242e-05, "loss": 0.0395, "reward": 2.0463170409202576, "reward_std": 0.1574108824133873, "rewards/accuracy_reward": 0.06696428940631449, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 1181 }, { "clip_ratio": 0.0, "completion_length": 593.7634124755859, "epoch": 0.35307295945037714, "grad_norm": 0.1799377053976059, "kl": 0.11083984375, "learning_rate": 1.6344940624684413e-05, "loss": 0.0349, "reward": 2.1629464626312256, "reward_std": 0.16311326064169407, "rewards/accuracy_reward": 0.1941964328289032, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9866071939468384, "step": 1182 }, { "clip_ratio": 0.0, "completion_length": 571.0134353637695, "epoch": 0.3533716675378986, "grad_norm": 0.6631428003311157, "kl": 0.1322021484375, "learning_rate": 1.6336875341820052e-05, "loss": 0.0614, "reward": 2.041852831840515, "reward_std": 0.21318715438246727, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9860491454601288, "step": 1183 }, { "clip_ratio": 0.0, "completion_length": 609.5022583007812, "epoch": 0.3536703756254201, "grad_norm": 1.6622748374938965, "kl": 0.191162109375, "learning_rate": 1.6328803165056405e-05, "loss": 0.0959, "reward": 2.087053656578064, "reward_std": 0.22076209262013435, "rewards/accuracy_reward": 0.12500000721774995, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9843750447034836, "step": 1184 }, { "clip_ratio": 0.0, "completion_length": 600.6540374755859, "epoch": 0.3539690837129415, "grad_norm": 0.3526524305343628, "kl": 0.1507568359375, "learning_rate": 1.63207241031752e-05, "loss": 0.0196, "reward": 1.99162957072258, "reward_std": 0.12128153257071972, "rewards/accuracy_reward": 0.015625000465661287, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9916295111179352, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 633.6205596923828, "epoch": 0.35426779180046297, "grad_norm": 0.5828351378440857, "kl": 0.103515625, "learning_rate": 1.631263816496567e-05, "loss": 0.0486, "reward": 2.0563617050647736, "reward_std": 0.2288995496928692, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9871652275323868, "step": 1186 }, { "clip_ratio": 0.0, "completion_length": 623.9955596923828, "epoch": 0.35456649988798444, "grad_norm": 1.3421553373336792, "kl": 0.3516845703125, "learning_rate": 1.630454535922452e-05, "loss": 0.0224, "reward": 2.0457590222358704, "reward_std": 0.18544107116758823, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9921875149011612, "step": 1187 }, { "clip_ratio": 0.0, "completion_length": 582.1741333007812, "epoch": 0.3548652079755059, "grad_norm": 1.4891606569290161, "kl": 0.194580078125, "learning_rate": 1.6296445694755937e-05, "loss": 0.0743, "reward": 2.037388503551483, "reward_std": 0.15910603292286396, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9821429252624512, "rewards/tag_count_reward": 0.98604916036129, "step": 1188 }, { "clip_ratio": 0.0, "completion_length": 576.3192291259766, "epoch": 0.3551639160630274, "grad_norm": 0.3242643475532532, "kl": 0.1112060546875, "learning_rate": 1.628833918037155e-05, "loss": 0.0228, "reward": 2.1244420409202576, "reward_std": 0.17956644296646118, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9882812947034836, "step": 1189 }, { "clip_ratio": 0.0, "completion_length": 549.5468978881836, "epoch": 0.35546262415054886, "grad_norm": 0.47072941064834595, "kl": 0.10791015625, "learning_rate": 1.628022582489046e-05, "loss": 0.0589, "reward": 2.1227679550647736, "reward_std": 0.1985839419066906, "rewards/accuracy_reward": 0.14732143399305642, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9910714626312256, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 574.4910888671875, "epoch": 0.3557613322380703, "grad_norm": 0.1567360907793045, "kl": 0.1026611328125, "learning_rate": 1.6272105637139203e-05, "loss": 0.0281, "reward": 2.1875001788139343, "reward_std": 0.19994967058300972, "rewards/accuracy_reward": 0.2031250074505806, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.993303582072258, "step": 1191 }, { "clip_ratio": 0.0, "completion_length": 602.3102874755859, "epoch": 0.3560600403255918, "grad_norm": 0.15464182198047638, "kl": 0.1102294921875, "learning_rate": 1.6263978625951743e-05, "loss": 0.0503, "reward": 2.0931920409202576, "reward_std": 0.19378501549363136, "rewards/accuracy_reward": 0.12053571688011289, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9905134290456772, "step": 1192 }, { "clip_ratio": 0.0, "completion_length": 562.935302734375, "epoch": 0.35635874841311327, "grad_norm": 0.14760376513004303, "kl": 0.1181640625, "learning_rate": 1.6255844800169472e-05, "loss": 0.0444, "reward": 2.098214328289032, "reward_std": 0.169113015756011, "rewards/accuracy_reward": 0.12723214668221772, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9888393133878708, "step": 1193 }, { "clip_ratio": 0.0, "completion_length": 518.1294860839844, "epoch": 0.35665745650063474, "grad_norm": 0.21869127452373505, "kl": 0.1527099609375, "learning_rate": 1.62477041686412e-05, "loss": 0.0766, "reward": 2.1121652722358704, "reward_std": 0.2368068303912878, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.984933078289032, "step": 1194 }, { "clip_ratio": 0.0, "completion_length": 531.1295013427734, "epoch": 0.3569561645881562, "grad_norm": 0.11475037038326263, "kl": 0.128662109375, "learning_rate": 1.6239556740223132e-05, "loss": 0.0492, "reward": 2.1763393878936768, "reward_std": 0.12200032733380795, "rewards/accuracy_reward": 0.19866072502918541, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.988839328289032, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 585.3772430419922, "epoch": 0.3572548726756777, "grad_norm": 0.8085581660270691, "kl": 0.2508544921875, "learning_rate": 1.6231402523778873e-05, "loss": 0.0387, "reward": 2.050781339406967, "reward_std": 0.15341350249946117, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9882812798023224, "step": 1196 }, { "clip_ratio": 0.0, "completion_length": 536.787971496582, "epoch": 0.35755358076319915, "grad_norm": 0.45116397738456726, "kl": 0.1754150390625, "learning_rate": 1.6223241528179415e-05, "loss": 0.0515, "reward": 2.0920759737491608, "reward_std": 0.21359282359480858, "rewards/accuracy_reward": 0.12723214668221772, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9849330633878708, "step": 1197 }, { "clip_ratio": 0.0, "completion_length": 555.1160888671875, "epoch": 0.3578522888507206, "grad_norm": 0.20296962559223175, "kl": 0.167236328125, "learning_rate": 1.6215073762303113e-05, "loss": 0.0695, "reward": 2.191964328289032, "reward_std": 0.2359352633357048, "rewards/accuracy_reward": 0.23437500977888703, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9821428954601288, "step": 1198 }, { "clip_ratio": 0.0, "completion_length": 552.529052734375, "epoch": 0.3581509969382421, "grad_norm": 0.236254021525383, "kl": 0.12890625, "learning_rate": 1.62068992350357e-05, "loss": 0.0707, "reward": 2.0825893878936768, "reward_std": 0.2215857356786728, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.9799107313156128, "rewards/tag_count_reward": 0.9843750298023224, "step": 1199 }, { "clip_ratio": 0.0, "completion_length": 578.4129638671875, "epoch": 0.35844970502576357, "grad_norm": 0.3206605911254883, "kl": 0.1334228515625, "learning_rate": 1.6198717955270264e-05, "loss": 0.041, "reward": 2.1004465222358704, "reward_std": 0.17701421864330769, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9866071790456772, "step": 1200 }, { "clip_ratio": 0.0, "completion_length": 565.0268096923828, "epoch": 0.35874841311328504, "grad_norm": 0.2313549518585205, "kl": 0.115234375, "learning_rate": 1.619052993190723e-05, "loss": 0.0285, "reward": 2.1406250596046448, "reward_std": 0.15048612654209137, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714775323868, "step": 1201 }, { "clip_ratio": 0.0, "completion_length": 577.0736846923828, "epoch": 0.3590471212008065, "grad_norm": 0.42409104108810425, "kl": 0.15283203125, "learning_rate": 1.6182335173854368e-05, "loss": 0.0695, "reward": 2.074218839406967, "reward_std": 0.19872584007680416, "rewards/accuracy_reward": 0.11607143585570157, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9827009290456772, "step": 1202 }, { "clip_ratio": 0.0, "completion_length": 594.310302734375, "epoch": 0.359345829288328, "grad_norm": 0.21645620465278625, "kl": 0.153076171875, "learning_rate": 1.617413369002677e-05, "loss": 0.0249, "reward": 2.0993304550647736, "reward_std": 0.15891692601144314, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.994419664144516, "step": 1203 }, { "clip_ratio": 0.0, "completion_length": 622.3928833007812, "epoch": 0.35964453737584945, "grad_norm": 0.3476819097995758, "kl": 0.177001953125, "learning_rate": 1.616592548934685e-05, "loss": 0.0788, "reward": 1.977678656578064, "reward_std": 0.2267992552369833, "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9799107611179352, "step": 1204 }, { "clip_ratio": 0.0, "completion_length": 619.4754638671875, "epoch": 0.3599432454633709, "grad_norm": 0.33709660172462463, "kl": 0.223876953125, "learning_rate": 1.6157710580744322e-05, "loss": 0.0592, "reward": 2.063058078289032, "reward_std": 0.25168925523757935, "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9827009290456772, "step": 1205 }, { "clip_ratio": 0.0, "completion_length": 573.4643020629883, "epoch": 0.3602419535508924, "grad_norm": 0.39909759163856506, "kl": 0.2060546875, "learning_rate": 1.61494889731562e-05, "loss": 0.0389, "reward": 2.0262278020381927, "reward_std": 0.20669321343302727, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9860491454601288, "step": 1206 }, { "clip_ratio": 0.0, "completion_length": 602.1920013427734, "epoch": 0.36054066163841386, "grad_norm": 0.22957025468349457, "kl": 0.1446533203125, "learning_rate": 1.614126067552679e-05, "loss": 0.0291, "reward": 2.0000001788139343, "reward_std": 0.10925697069615126, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933035969734192, "step": 1207 }, { "clip_ratio": 0.0, "completion_length": 583.8326263427734, "epoch": 0.36083936972593533, "grad_norm": 0.19923803210258484, "kl": 0.1297607421875, "learning_rate": 1.6133025696807674e-05, "loss": 0.0516, "reward": 2.0474331378936768, "reward_std": 0.18661252409219742, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.984933078289032, "step": 1208 }, { "clip_ratio": 0.0, "completion_length": 584.3013763427734, "epoch": 0.3611380778134568, "grad_norm": 0.18971475958824158, "kl": 0.134765625, "learning_rate": 1.6124784045957705e-05, "loss": 0.0629, "reward": 2.035714417695999, "reward_std": 0.17519665136933327, "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9866071939468384, "step": 1209 }, { "clip_ratio": 0.0, "completion_length": 577.8058319091797, "epoch": 0.3614367859009783, "grad_norm": 0.20555347204208374, "kl": 0.14111328125, "learning_rate": 1.6116535731942982e-05, "loss": 0.0578, "reward": 2.012276917695999, "reward_std": 0.1989617645740509, "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9877232611179352, "step": 1210 }, { "clip_ratio": 0.0, "completion_length": 593.5625305175781, "epoch": 0.36173549398849975, "grad_norm": 0.13315574824810028, "kl": 0.1168212890625, "learning_rate": 1.610828076373687e-05, "loss": 0.0386, "reward": 2.0284599363803864, "reward_std": 0.12957448698580265, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 1211 }, { "clip_ratio": 0.0, "completion_length": 520.7009124755859, "epoch": 0.3620342020760212, "grad_norm": 0.12253045290708542, "kl": 0.1064453125, "learning_rate": 1.6100019150319966e-05, "loss": 0.0079, "reward": 2.1489956378936768, "reward_std": 0.13058488070964813, "rewards/accuracy_reward": 0.16071428963914514, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 1212 }, { "clip_ratio": 0.0, "completion_length": 551.2701187133789, "epoch": 0.3623329101635427, "grad_norm": 0.3128413259983063, "kl": 0.1539306640625, "learning_rate": 1.6091750900680088e-05, "loss": 0.0449, "reward": 2.103236675262451, "reward_std": 0.1978599689900875, "rewards/accuracy_reward": 0.1383928586728871, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9827009439468384, "step": 1213 }, { "clip_ratio": 0.0, "completion_length": 536.3437881469727, "epoch": 0.36263161825106416, "grad_norm": 0.6315244436264038, "kl": 0.1513671875, "learning_rate": 1.608347602381229e-05, "loss": 0.0709, "reward": 2.1986608505249023, "reward_std": 0.16779845021665096, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 1214 }, { "clip_ratio": 0.0, "completion_length": 489.3482360839844, "epoch": 0.36293032633858563, "grad_norm": 0.24625541269779205, "kl": 0.1337890625, "learning_rate": 1.6075194528718818e-05, "loss": 0.0302, "reward": 2.1562500596046448, "reward_std": 0.18241567723453045, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1215 }, { "clip_ratio": 0.0, "completion_length": 498.20091247558594, "epoch": 0.3632290344261071, "grad_norm": 0.2158639132976532, "kl": 0.1502685546875, "learning_rate": 1.6066906424409135e-05, "loss": 0.0832, "reward": 2.0078126192092896, "reward_std": 0.1359437769278884, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.987723246216774, "step": 1216 }, { "clip_ratio": 0.0, "completion_length": 503.9196548461914, "epoch": 0.36352774251362857, "grad_norm": 0.41877850890159607, "kl": 0.1363525390625, "learning_rate": 1.605861171989988e-05, "loss": 0.0748, "reward": 2.060267984867096, "reward_std": 0.15144335944205523, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9910714477300644, "step": 1217 }, { "clip_ratio": 0.0, "completion_length": 489.8303909301758, "epoch": 0.36382645060115004, "grad_norm": 0.147544264793396, "kl": 0.1136474609375, "learning_rate": 1.6050310424214885e-05, "loss": 0.0104, "reward": 2.130580484867096, "reward_std": 0.1875695437192917, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 1218 }, { "clip_ratio": 0.0, "completion_length": 510.78797912597656, "epoch": 0.3641251586886715, "grad_norm": 0.19236363470554352, "kl": 0.1173095703125, "learning_rate": 1.604200254638514e-05, "loss": 0.0504, "reward": 2.0385046005249023, "reward_std": 0.1111270422115922, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616305589676, "step": 1219 }, { "clip_ratio": 0.0, "completion_length": 499.64957427978516, "epoch": 0.364423866776193, "grad_norm": 0.23126409947872162, "kl": 0.1273193359375, "learning_rate": 1.6033688095448808e-05, "loss": 0.0464, "reward": 2.1289064288139343, "reward_std": 0.11784686613827944, "rewards/accuracy_reward": 0.14062500861473382, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 1220 }, { "clip_ratio": 0.0, "completion_length": 568.3973388671875, "epoch": 0.36472257486371445, "grad_norm": 0.16385775804519653, "kl": 0.12060546875, "learning_rate": 1.602536708045119e-05, "loss": 0.0242, "reward": 2.053013503551483, "reward_std": 0.09697395004332066, "rewards/accuracy_reward": 0.06250000488944352, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9949776977300644, "step": 1221 }, { "clip_ratio": 0.0, "completion_length": 555.7634124755859, "epoch": 0.3650212829512359, "grad_norm": 0.15587188303470612, "kl": 0.1461181640625, "learning_rate": 1.6017039510444737e-05, "loss": 0.0777, "reward": 2.040736734867096, "reward_std": 0.20740671828389168, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.984933078289032, "step": 1222 }, { "clip_ratio": 0.0, "completion_length": 531.7790451049805, "epoch": 0.3653199910387574, "grad_norm": 0.1588798314332962, "kl": 0.133056640625, "learning_rate": 1.6008705394489032e-05, "loss": 0.0199, "reward": 2.092076003551483, "reward_std": 0.0917809596285224, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9938616305589676, "step": 1223 }, { "clip_ratio": 0.0, "completion_length": 552.6361846923828, "epoch": 0.36561869912627887, "grad_norm": 0.4807548522949219, "kl": 0.175048828125, "learning_rate": 1.6000364741650775e-05, "loss": 0.0319, "reward": 2.0926340222358704, "reward_std": 0.16116963326931, "rewards/accuracy_reward": 0.10937500675208867, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 1224 }, { "clip_ratio": 0.0, "completion_length": 542.2053833007812, "epoch": 0.36591740721380034, "grad_norm": 0.262432336807251, "kl": 0.2840576171875, "learning_rate": 1.5992017561003777e-05, "loss": 0.0496, "reward": 2.012834906578064, "reward_std": 0.21197157353162766, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.983816996216774, "step": 1225 }, { "clip_ratio": 0.0, "completion_length": 532.9419860839844, "epoch": 0.3662161153013218, "grad_norm": 0.332716703414917, "kl": 0.119873046875, "learning_rate": 1.598366386162895e-05, "loss": 0.0332, "reward": 2.0574778020381927, "reward_std": 0.09664397686719894, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 1226 }, { "clip_ratio": 0.0, "completion_length": 577.8951263427734, "epoch": 0.3665148233888433, "grad_norm": 0.1929003894329071, "kl": 0.1444091796875, "learning_rate": 1.597530365261431e-05, "loss": 0.0243, "reward": 2.075892925262451, "reward_std": 0.18719999864697456, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9888393133878708, "step": 1227 }, { "clip_ratio": 0.0, "completion_length": 574.1250305175781, "epoch": 0.3668135314763647, "grad_norm": 0.28432244062423706, "kl": 0.15771484375, "learning_rate": 1.5966936943054933e-05, "loss": 0.0651, "reward": 2.144531339406967, "reward_std": 0.20653471909463406, "rewards/accuracy_reward": 0.18080357694998384, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.983816996216774, "step": 1228 }, { "clip_ratio": 0.0, "completion_length": 585.9486846923828, "epoch": 0.36711223956388617, "grad_norm": 1.757881999015808, "kl": 0.37548828125, "learning_rate": 1.5958563742052987e-05, "loss": 0.0459, "reward": 2.0530134439468384, "reward_std": 0.19171074405312538, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9905134290456772, "step": 1229 }, { "clip_ratio": 0.0, "completion_length": 572.1183319091797, "epoch": 0.36741094765140764, "grad_norm": 0.16488973796367645, "kl": 0.1129150390625, "learning_rate": 1.5950184058717694e-05, "loss": 0.0385, "reward": 2.1261162161827087, "reward_std": 0.20496245473623276, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.994419664144516, "step": 1230 }, { "clip_ratio": 0.0, "completion_length": 584.9844055175781, "epoch": 0.3677096557389291, "grad_norm": 0.3307594656944275, "kl": 0.1380615234375, "learning_rate": 1.5941797902165325e-05, "loss": 0.0723, "reward": 2.0853795409202576, "reward_std": 0.21830780059099197, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9827009290456772, "step": 1231 }, { "clip_ratio": 0.0, "completion_length": 597.5402069091797, "epoch": 0.3680083638264506, "grad_norm": 0.557966411113739, "kl": 0.1544189453125, "learning_rate": 1.5933405281519195e-05, "loss": 0.0744, "reward": 2.01506707072258, "reward_std": 0.21328108478337526, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9793527126312256, "step": 1232 }, { "clip_ratio": 0.0, "completion_length": 552.2165374755859, "epoch": 0.36830707191397205, "grad_norm": 0.3264070749282837, "kl": 0.1708984375, "learning_rate": 1.5925006205909654e-05, "loss": 0.066, "reward": 2.1434152722358704, "reward_std": 0.2224917747080326, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 1233 }, { "clip_ratio": 0.0, "completion_length": 578.9352874755859, "epoch": 0.3686057800014935, "grad_norm": 0.9557449221611023, "kl": 0.2969970703125, "learning_rate": 1.5916600684474076e-05, "loss": 0.0781, "reward": 2.0479912161827087, "reward_std": 0.2750463895499706, "rewards/accuracy_reward": 0.10491071734577417, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9743303954601288, "step": 1234 }, { "clip_ratio": 0.0, "completion_length": 583.6205596923828, "epoch": 0.368904488089015, "grad_norm": 0.2797626852989197, "kl": 0.1605224609375, "learning_rate": 1.5908188726356843e-05, "loss": 0.043, "reward": 2.0580357909202576, "reward_std": 0.13208809681236744, "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9866071790456772, "step": 1235 }, { "clip_ratio": 0.0, "completion_length": 589.5982360839844, "epoch": 0.36920319617653646, "grad_norm": 0.4013192653656006, "kl": 0.208740234375, "learning_rate": 1.589977034070934e-05, "loss": 0.0857, "reward": 1.9531250894069672, "reward_std": 0.2293182723224163, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9776786118745804, "step": 1236 }, { "clip_ratio": 0.0, "completion_length": 593.9531555175781, "epoch": 0.36950190426405793, "grad_norm": 0.21921592950820923, "kl": 0.1728515625, "learning_rate": 1.5891345536689943e-05, "loss": 0.0871, "reward": 2.0870537161827087, "reward_std": 0.23198588658124208, "rewards/accuracy_reward": 0.13392858067527413, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9843750298023224, "step": 1237 }, { "clip_ratio": 0.0, "completion_length": 564.3884124755859, "epoch": 0.3698006123515794, "grad_norm": 0.35704752802848816, "kl": 0.1826171875, "learning_rate": 1.5882914323464022e-05, "loss": 0.0921, "reward": 2.0591518580913544, "reward_std": 0.2726907953619957, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9720982611179352, "step": 1238 }, { "clip_ratio": 0.0, "completion_length": 527.6428756713867, "epoch": 0.3700993204391009, "grad_norm": 0.3425486385822296, "kl": 0.134033203125, "learning_rate": 1.5874476710203902e-05, "loss": 0.0672, "reward": 2.0647322833538055, "reward_std": 0.18033185973763466, "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9866071939468384, "step": 1239 }, { "clip_ratio": 0.0, "completion_length": 531.6741485595703, "epoch": 0.37039802852662235, "grad_norm": 0.2858772575855255, "kl": 0.14697265625, "learning_rate": 1.586603270608888e-05, "loss": 0.0661, "reward": 2.0998884737491608, "reward_std": 0.23287426680326462, "rewards/accuracy_reward": 0.14732143376022577, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9793527275323868, "step": 1240 }, { "clip_ratio": 0.0, "completion_length": 555.4330749511719, "epoch": 0.3706967366141438, "grad_norm": 0.40009936690330505, "kl": 0.1976318359375, "learning_rate": 1.5857582320305207e-05, "loss": 0.1127, "reward": 2.007812589406967, "reward_std": 0.19470183737576008, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.983258992433548, "step": 1241 }, { "clip_ratio": 0.0, "completion_length": 577.3393249511719, "epoch": 0.3709954447016653, "grad_norm": 0.8637252449989319, "kl": 0.2767333984375, "learning_rate": 1.5849125562046075e-05, "loss": 0.064, "reward": 1.9732143580913544, "reward_std": 0.1727153342217207, "rewards/accuracy_reward": 0.02232142980210483, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.979910746216774, "step": 1242 }, { "clip_ratio": 0.0, "completion_length": 579.2031555175781, "epoch": 0.37129415278918676, "grad_norm": 0.2859642207622528, "kl": 0.162353515625, "learning_rate": 1.584066244051161e-05, "loss": 0.0734, "reward": 2.0005581378936768, "reward_std": 0.2443934641778469, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 1243 }, { "clip_ratio": 0.0, "completion_length": 547.8839569091797, "epoch": 0.37159286087670823, "grad_norm": 0.24260981380939484, "kl": 0.1925048828125, "learning_rate": 1.583219296490885e-05, "loss": 0.0778, "reward": 2.053013503551483, "reward_std": 0.22732090577483177, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9838170111179352, "step": 1244 }, { "clip_ratio": 0.0, "completion_length": 570.4776992797852, "epoch": 0.3718915689642297, "grad_norm": 0.48612770438194275, "kl": 0.220703125, "learning_rate": 1.5823717144451768e-05, "loss": 0.0732, "reward": 2.0161831080913544, "reward_std": 0.18181153759360313, "rewards/accuracy_reward": 0.060267861699685454, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9827009290456772, "step": 1245 }, { "clip_ratio": 0.0, "completion_length": 571.4576263427734, "epoch": 0.3721902770517512, "grad_norm": 0.2191178798675537, "kl": 0.137451171875, "learning_rate": 1.581523498836121e-05, "loss": 0.0353, "reward": 2.133928656578064, "reward_std": 0.18684300780296326, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.9910714477300644, "step": 1246 }, { "clip_ratio": 0.0, "completion_length": 593.0602874755859, "epoch": 0.37248898513927264, "grad_norm": 0.24475809931755066, "kl": 0.197265625, "learning_rate": 1.5806746505864947e-05, "loss": 0.0641, "reward": 2.019531339406967, "reward_std": 0.2687748447060585, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9748884290456772, "step": 1247 }, { "clip_ratio": 0.0, "completion_length": 488.4330520629883, "epoch": 0.3727876932267941, "grad_norm": 0.20626607537269592, "kl": 0.1295166015625, "learning_rate": 1.5798251706197606e-05, "loss": 0.0528, "reward": 2.117187649011612, "reward_std": 0.16439055278897285, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9899553954601288, "step": 1248 }, { "clip_ratio": 0.0, "completion_length": 567.7165374755859, "epoch": 0.3730864013143156, "grad_norm": 0.16641777753829956, "kl": 0.10595703125, "learning_rate": 1.5789750598600693e-05, "loss": 0.0391, "reward": 2.1138393878936768, "reward_std": 0.20657880418002605, "rewards/accuracy_reward": 0.14955358067527413, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9866071790456772, "step": 1249 }, { "clip_ratio": 0.0, "completion_length": 493.27234649658203, "epoch": 0.37338510940183706, "grad_norm": 0.14920395612716675, "kl": 0.110107421875, "learning_rate": 1.578124319232259e-05, "loss": 0.0714, "reward": 2.0904018580913544, "reward_std": 0.17382002249360085, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553805589676, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 573.8750305175781, "epoch": 0.3736838174893585, "grad_norm": 0.22336354851722717, "kl": 0.1343994140625, "learning_rate": 1.577272949661852e-05, "loss": 0.0409, "reward": 2.0407367050647736, "reward_std": 0.19303172454237938, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973618745804, "step": 1251 }, { "clip_ratio": 0.0, "completion_length": 522.2924423217773, "epoch": 0.37398252557688, "grad_norm": 0.22966362535953522, "kl": 0.156982421875, "learning_rate": 1.576420952075054e-05, "loss": 0.079, "reward": 2.0295759737491608, "reward_std": 0.18479255586862564, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9827009439468384, "step": 1252 }, { "clip_ratio": 0.0, "completion_length": 531.0558242797852, "epoch": 0.37428123366440147, "grad_norm": 0.18737062811851501, "kl": 0.1065673828125, "learning_rate": 1.5755683273987554e-05, "loss": 0.0498, "reward": 2.027343839406967, "reward_std": 0.15447289869189262, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9871652126312256, "step": 1253 }, { "clip_ratio": 0.0, "completion_length": 503.0736770629883, "epoch": 0.37457994175192294, "grad_norm": 0.12644799053668976, "kl": 0.1026611328125, "learning_rate": 1.5747150765605285e-05, "loss": 0.0148, "reward": 2.0747768580913544, "reward_std": 0.13304893113672733, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.994419664144516, "step": 1254 }, { "clip_ratio": 0.0, "completion_length": 553.5781555175781, "epoch": 0.3748786498394444, "grad_norm": 0.2386494129896164, "kl": 0.210205078125, "learning_rate": 1.5738612004886267e-05, "loss": 0.0487, "reward": 1.9927456676959991, "reward_std": 0.1437948690727353, "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812947034836, "step": 1255 }, { "clip_ratio": 0.0, "completion_length": 540.7812652587891, "epoch": 0.3751773579269659, "grad_norm": 0.25896114110946655, "kl": 0.178466796875, "learning_rate": 1.5730067001119832e-05, "loss": 0.0748, "reward": 2.0965403020381927, "reward_std": 0.20765643566846848, "rewards/accuracy_reward": 0.13839286612346768, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9827009439468384, "step": 1256 }, { "clip_ratio": 0.0, "completion_length": 538.3995666503906, "epoch": 0.37547606601448735, "grad_norm": 0.5846652984619141, "kl": 0.19970703125, "learning_rate": 1.5721515763602106e-05, "loss": 0.1011, "reward": 2.0156250596046448, "reward_std": 0.256892915815115, "rewards/accuracy_reward": 0.06473214784637094, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9776786267757416, "step": 1257 }, { "clip_ratio": 0.0, "completion_length": 483.1718978881836, "epoch": 0.3757747741020088, "grad_norm": 0.21827948093414307, "kl": 0.170654296875, "learning_rate": 1.5712958301635993e-05, "loss": 0.0898, "reward": 2.1032367944717407, "reward_std": 0.2151811495423317, "rewards/accuracy_reward": 0.14062500465661287, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9827009290456772, "step": 1258 }, { "clip_ratio": 0.0, "completion_length": 544.0111770629883, "epoch": 0.3760734821895303, "grad_norm": 0.3156997263431549, "kl": 0.18603515625, "learning_rate": 1.5704394624531184e-05, "loss": 0.0385, "reward": 2.0491072237491608, "reward_std": 0.15443539805710316, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.988839328289032, "step": 1259 }, { "clip_ratio": 0.0, "completion_length": 532.4085159301758, "epoch": 0.37637219027705177, "grad_norm": 0.49958741664886475, "kl": 0.1805419921875, "learning_rate": 1.5695824741604114e-05, "loss": 0.0524, "reward": 2.0904018878936768, "reward_std": 0.15710200555622578, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9899553954601288, "step": 1260 }, { "clip_ratio": 0.0, "completion_length": 486.43528747558594, "epoch": 0.37667089836457324, "grad_norm": 0.19192224740982056, "kl": 0.171875, "learning_rate": 1.568724866217797e-05, "loss": 0.0655, "reward": 2.0904018878936768, "reward_std": 0.21644221805036068, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.983258992433548, "step": 1261 }, { "clip_ratio": 0.0, "completion_length": 474.2076110839844, "epoch": 0.3769696064520947, "grad_norm": 0.923538863658905, "kl": 0.1417236328125, "learning_rate": 1.56786663955827e-05, "loss": 0.038, "reward": 2.1763393878936768, "reward_std": 0.14105456788092852, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 1262 }, { "clip_ratio": 0.0, "completion_length": 525.6741333007812, "epoch": 0.3772683145396162, "grad_norm": 0.32267987728118896, "kl": 0.14306640625, "learning_rate": 1.5670077951154955e-05, "loss": 0.0811, "reward": 2.1015626192092896, "reward_std": 0.2782890424132347, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.981026828289032, "step": 1263 }, { "clip_ratio": 0.0, "completion_length": 493.6607360839844, "epoch": 0.37756702262713765, "grad_norm": 0.318539559841156, "kl": 0.1622314453125, "learning_rate": 1.5661483338238127e-05, "loss": 0.0648, "reward": 2.061384081840515, "reward_std": 0.17902828752994537, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9877232611179352, "step": 1264 }, { "clip_ratio": 0.0, "completion_length": 524.9085083007812, "epoch": 0.3778657307146591, "grad_norm": 0.17791487276554108, "kl": 0.1397705078125, "learning_rate": 1.5652882566182316e-05, "loss": 0.0508, "reward": 2.0435268580913544, "reward_std": 0.14415285363793373, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9899553954601288, "step": 1265 }, { "clip_ratio": 0.0, "completion_length": 524.1250152587891, "epoch": 0.3781644388021806, "grad_norm": 0.4347815215587616, "kl": 0.225830078125, "learning_rate": 1.5644275644344313e-05, "loss": 0.0473, "reward": 2.095982253551483, "reward_std": 0.19448642805218697, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9866071790456772, "step": 1266 }, { "clip_ratio": 0.0, "completion_length": 520.2701187133789, "epoch": 0.37846314688970206, "grad_norm": 0.24447813630104065, "kl": 0.123046875, "learning_rate": 1.5635662582087604e-05, "loss": 0.026, "reward": 2.0859376192092896, "reward_std": 0.09136179089546204, "rewards/accuracy_reward": 0.09598214901052415, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966517984867096, "step": 1267 }, { "clip_ratio": 0.0, "completion_length": 532.7522583007812, "epoch": 0.37876185497722353, "grad_norm": 0.1969151347875595, "kl": 0.155517578125, "learning_rate": 1.5627043388782365e-05, "loss": 0.0616, "reward": 2.141741156578064, "reward_std": 0.1725965989753604, "rewards/accuracy_reward": 0.16964286309666932, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.987723246216774, "step": 1268 }, { "clip_ratio": 0.0, "completion_length": 547.9107360839844, "epoch": 0.379060563064745, "grad_norm": 0.8161503076553345, "kl": 0.169921875, "learning_rate": 1.5618418073805425e-05, "loss": 0.096, "reward": 2.0619420409202576, "reward_std": 0.26442209631204605, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9815848618745804, "step": 1269 }, { "clip_ratio": 0.0, "completion_length": 571.3772583007812, "epoch": 0.3793592711522665, "grad_norm": 0.229054257273674, "kl": 0.1220703125, "learning_rate": 1.560978664654029e-05, "loss": 0.0461, "reward": 2.158482253551483, "reward_std": 0.16136298701167107, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9910714626312256, "step": 1270 }, { "clip_ratio": 0.0, "completion_length": 522.5424194335938, "epoch": 0.3796579792397879, "grad_norm": 0.25287947058677673, "kl": 0.1787109375, "learning_rate": 1.5601149116377095e-05, "loss": 0.0802, "reward": 2.0396206378936768, "reward_std": 0.20205790735781193, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9793527126312256, "step": 1271 }, { "clip_ratio": 0.0, "completion_length": 554.5513610839844, "epoch": 0.37995668732730936, "grad_norm": 0.19290439784526825, "kl": 0.1571044921875, "learning_rate": 1.5592505492712635e-05, "loss": 0.0576, "reward": 2.050781339406967, "reward_std": 0.15772902593016624, "rewards/accuracy_reward": 0.07366071874275804, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9905134439468384, "step": 1272 }, { "clip_ratio": 0.0, "completion_length": 572.8437652587891, "epoch": 0.38025539541483083, "grad_norm": 2.3690829277038574, "kl": 0.341064453125, "learning_rate": 1.5583855784950323e-05, "loss": 0.0881, "reward": 2.1545759439468384, "reward_std": 0.26088231429457664, "rewards/accuracy_reward": 0.20535715483129025, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9782366454601288, "step": 1273 }, { "clip_ratio": 0.0, "completion_length": 552.8683319091797, "epoch": 0.3805541035023523, "grad_norm": 0.17746984958648682, "kl": 0.119140625, "learning_rate": 1.5575200002500197e-05, "loss": 0.0407, "reward": 2.0848215222358704, "reward_std": 0.1711163241416216, "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.993303582072258, "step": 1274 }, { "clip_ratio": 0.0, "completion_length": 549.366096496582, "epoch": 0.3808528115898738, "grad_norm": 0.15306901931762695, "kl": 0.133056640625, "learning_rate": 1.5566538154778894e-05, "loss": 0.0345, "reward": 2.0820313096046448, "reward_std": 0.1283650202676654, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.990513414144516, "step": 1275 }, { "clip_ratio": 0.0, "completion_length": 535.8147659301758, "epoch": 0.38115151967739525, "grad_norm": 0.25247499346733093, "kl": 0.1441650390625, "learning_rate": 1.555787025120966e-05, "loss": 0.0376, "reward": 2.064732313156128, "reward_std": 0.16219197865575552, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.9866071492433548, "rewards/tag_count_reward": 0.9910714328289032, "step": 1276 }, { "clip_ratio": 0.0, "completion_length": 539.6830520629883, "epoch": 0.3814502277649167, "grad_norm": 0.17527000606060028, "kl": 0.1219482421875, "learning_rate": 1.554919630122232e-05, "loss": 0.04, "reward": 2.1361607909202576, "reward_std": 0.10926515236496925, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9933035969734192, "step": 1277 }, { "clip_ratio": 0.0, "completion_length": 500.48216247558594, "epoch": 0.3817489358524382, "grad_norm": 0.9884418249130249, "kl": 0.195068359375, "learning_rate": 1.5540516314253284e-05, "loss": 0.0636, "reward": 2.122209906578064, "reward_std": 0.22952507436275482, "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9882812798023224, "step": 1278 }, { "clip_ratio": 0.0, "completion_length": 547.7634124755859, "epoch": 0.38204764393995966, "grad_norm": 0.1274106502532959, "kl": 0.140625, "learning_rate": 1.553183029974553e-05, "loss": 0.0284, "reward": 2.0619420409202576, "reward_std": 0.08020107541233301, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.992745578289032, "step": 1279 }, { "clip_ratio": 0.0, "completion_length": 497.83038330078125, "epoch": 0.38234635202748113, "grad_norm": 0.35320717096328735, "kl": 0.1405029296875, "learning_rate": 1.5523138267148582e-05, "loss": 0.0799, "reward": 2.0379464626312256, "reward_std": 0.23467306047677994, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9821428954601288, "step": 1280 }, { "clip_ratio": 0.0, "completion_length": 569.4553833007812, "epoch": 0.3826450601150026, "grad_norm": 1.997308611869812, "kl": 0.270263671875, "learning_rate": 1.551444022591853e-05, "loss": 0.0588, "reward": 1.997209906578064, "reward_std": 0.13184263557195663, "rewards/accuracy_reward": 0.024553571827709675, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812798023224, "step": 1281 }, { "clip_ratio": 0.0, "completion_length": 541.6361999511719, "epoch": 0.38294376820252407, "grad_norm": 0.2428693324327469, "kl": 0.1153564453125, "learning_rate": 1.5505736185517984e-05, "loss": 0.0407, "reward": 2.066964417695999, "reward_std": 0.12243719771504402, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 1282 }, { "clip_ratio": 0.0, "completion_length": 561.1160888671875, "epoch": 0.38324247629004554, "grad_norm": 0.17941230535507202, "kl": 0.10595703125, "learning_rate": 1.5497026155416087e-05, "loss": 0.0293, "reward": 2.1350447833538055, "reward_std": 0.20902094617486, "rewards/accuracy_reward": 0.15178572107106447, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9921875149011612, "step": 1283 }, { "clip_ratio": 0.0, "completion_length": 559.1183319091797, "epoch": 0.383541184377567, "grad_norm": 0.23167091608047485, "kl": 0.1082763671875, "learning_rate": 1.5488310145088503e-05, "loss": 0.0472, "reward": 2.1406250596046448, "reward_std": 0.1855334658175707, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9843750447034836, "step": 1284 }, { "clip_ratio": 0.0, "completion_length": 535.0647430419922, "epoch": 0.3838398924650885, "grad_norm": 0.18108707666397095, "kl": 0.2955322265625, "learning_rate": 1.547958816401739e-05, "loss": 0.0099, "reward": 2.157924175262451, "reward_std": 0.15386850014328957, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.992745578289032, "step": 1285 }, { "clip_ratio": 0.0, "completion_length": 533.4241333007812, "epoch": 0.38413860055260995, "grad_norm": 0.1739879846572876, "kl": 0.1553955078125, "learning_rate": 1.5470860221691414e-05, "loss": 0.0685, "reward": 1.97991082072258, "reward_std": 0.15485317166894674, "rewards/accuracy_reward": 0.017857143422588706, "rewards/format_reward": 0.9799107313156128, "rewards/tag_count_reward": 0.9821428805589676, "step": 1286 }, { "clip_ratio": 0.0, "completion_length": 534.1049423217773, "epoch": 0.3844373086401314, "grad_norm": 0.18470606207847595, "kl": 0.1427001953125, "learning_rate": 1.5462126327605717e-05, "loss": 0.0562, "reward": 2.03850457072258, "reward_std": 0.15647910255938768, "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973618745804, "step": 1287 }, { "clip_ratio": 0.0, "completion_length": 559.2678985595703, "epoch": 0.3847360167276529, "grad_norm": 0.5500738024711609, "kl": 0.1417236328125, "learning_rate": 1.5453386491261923e-05, "loss": 0.0487, "reward": 2.1422992050647736, "reward_std": 0.16719818487763405, "rewards/accuracy_reward": 0.16517857694998384, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.990513414144516, "step": 1288 }, { "clip_ratio": 0.0, "completion_length": 547.9129791259766, "epoch": 0.38503472481517437, "grad_norm": 0.14542542397975922, "kl": 0.0999755859375, "learning_rate": 1.5444640722168114e-05, "loss": 0.0491, "reward": 2.1439733505249023, "reward_std": 0.14676668494939804, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9944196790456772, "step": 1289 }, { "clip_ratio": 0.0, "completion_length": 534.2076034545898, "epoch": 0.38533343290269584, "grad_norm": 1.2807670831680298, "kl": 0.36572265625, "learning_rate": 1.5435889029838832e-05, "loss": 0.0521, "reward": 2.0223214626312256, "reward_std": 0.09347299672663212, "rewards/accuracy_reward": 0.040178571827709675, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9910714626312256, "step": 1290 }, { "clip_ratio": 0.0, "completion_length": 559.3125305175781, "epoch": 0.3856321409902173, "grad_norm": 0.21827343106269836, "kl": 0.13134765625, "learning_rate": 1.542713142379506e-05, "loss": 0.0385, "reward": 2.0691965222358704, "reward_std": 0.12085692770779133, "rewards/accuracy_reward": 0.08258928684517741, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1291 }, { "clip_ratio": 0.0, "completion_length": 521.3482437133789, "epoch": 0.3859308490777388, "grad_norm": 0.1919410526752472, "kl": 0.14208984375, "learning_rate": 1.541836791356422e-05, "loss": 0.0496, "reward": 2.1350446939468384, "reward_std": 0.20832297019660473, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9854911118745804, "step": 1292 }, { "clip_ratio": 0.0, "completion_length": 556.7544860839844, "epoch": 0.38622955716526025, "grad_norm": 0.17860981822013855, "kl": 0.117919921875, "learning_rate": 1.5409598508680138e-05, "loss": 0.0329, "reward": 2.0379465222358704, "reward_std": 0.10890529677271843, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 1293 }, { "clip_ratio": 0.0, "completion_length": 585.9777221679688, "epoch": 0.3865282652527817, "grad_norm": 0.18958529829978943, "kl": 0.1422119140625, "learning_rate": 1.5400823218683083e-05, "loss": 0.0593, "reward": 2.044642925262451, "reward_std": 0.20098240301012993, "rewards/accuracy_reward": 0.0848214344587177, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9821428954601288, "step": 1294 }, { "clip_ratio": 0.0, "completion_length": 567.7678833007812, "epoch": 0.3868269733403032, "grad_norm": 0.19237491488456726, "kl": 0.1285400390625, "learning_rate": 1.53920420531197e-05, "loss": 0.0792, "reward": 2.2299107909202576, "reward_std": 0.2355727031826973, "rewards/accuracy_reward": 0.2656250111758709, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9866071939468384, "step": 1295 }, { "clip_ratio": 0.0, "completion_length": 546.9486770629883, "epoch": 0.38712568142782466, "grad_norm": 4.012144565582275, "kl": 0.568603515625, "learning_rate": 1.5383255021543042e-05, "loss": 0.0905, "reward": 2.012276828289032, "reward_std": 0.15758364461362362, "rewards/accuracy_reward": 0.03571428847499192, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9899553954601288, "step": 1296 }, { "clip_ratio": 0.0, "completion_length": 585.7500305175781, "epoch": 0.38742438951534613, "grad_norm": 0.1310054361820221, "kl": 0.1446533203125, "learning_rate": 1.5374462133512534e-05, "loss": 0.0999, "reward": 1.99553582072258, "reward_std": 0.22575382143259048, "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9799107760190964, "step": 1297 }, { "clip_ratio": 0.0, "completion_length": 597.419677734375, "epoch": 0.3877230976028676, "grad_norm": 0.1607973277568817, "kl": 0.1204833984375, "learning_rate": 1.5365663398593982e-05, "loss": 0.0485, "reward": 2.0178571939468384, "reward_std": 0.18150857836008072, "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.988839328289032, "step": 1298 }, { "clip_ratio": 0.0, "completion_length": 573.4219207763672, "epoch": 0.3880218056903891, "grad_norm": 0.2140103280544281, "kl": 0.1231689453125, "learning_rate": 1.5356858826359543e-05, "loss": 0.0437, "reward": 2.0027902126312256, "reward_std": 0.1478513665497303, "rewards/accuracy_reward": 0.029017857741564512, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973767757416, "step": 1299 }, { "clip_ratio": 0.0, "completion_length": 585.3951110839844, "epoch": 0.38832051377791055, "grad_norm": 0.2165612280368805, "kl": 0.1458740234375, "learning_rate": 1.534804842638773e-05, "loss": 0.0875, "reward": 2.032366156578064, "reward_std": 0.2356181936338544, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.978794664144516, "step": 1300 }, { "clip_ratio": 0.0, "completion_length": 564.084846496582, "epoch": 0.388619221865432, "grad_norm": 0.21418403089046478, "kl": 0.2640380859375, "learning_rate": 1.5339232208263394e-05, "loss": 0.0629, "reward": 2.115513503551483, "reward_std": 0.23521765414625406, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.983816996216774, "step": 1301 }, { "clip_ratio": 0.0, "completion_length": 557.8527069091797, "epoch": 0.3889179299529535, "grad_norm": 0.3204532265663147, "kl": 0.14208984375, "learning_rate": 1.533041018157771e-05, "loss": 0.03, "reward": 2.068080425262451, "reward_std": 0.09468860179185867, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9921875298023224, "step": 1302 }, { "clip_ratio": 0.0, "completion_length": 530.1160888671875, "epoch": 0.38921663804047496, "grad_norm": 0.17443978786468506, "kl": 0.1136474609375, "learning_rate": 1.532158235592819e-05, "loss": 0.0591, "reward": 2.026785761117935, "reward_std": 0.20230349898338318, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9866071790456772, "step": 1303 }, { "clip_ratio": 0.0, "completion_length": 504.9375228881836, "epoch": 0.38951534612799643, "grad_norm": 0.36252152919769287, "kl": 0.27099609375, "learning_rate": 1.5312748740918643e-05, "loss": 0.0831, "reward": 2.017299234867096, "reward_std": 0.2368696741759777, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.977120578289032, "step": 1304 }, { "clip_ratio": 0.0, "completion_length": 529.060302734375, "epoch": 0.3898140542155179, "grad_norm": 0.30825668573379517, "kl": 0.152099609375, "learning_rate": 1.5303909346159166e-05, "loss": 0.0893, "reward": 2.0033483505249023, "reward_std": 0.19375116750597954, "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9877232611179352, "step": 1305 }, { "clip_ratio": 0.0, "completion_length": 545.6875152587891, "epoch": 0.3901127623030394, "grad_norm": 0.2550397515296936, "kl": 0.1226806640625, "learning_rate": 1.529506418126616e-05, "loss": 0.0695, "reward": 2.0848215222358704, "reward_std": 0.17672374285757542, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.988839328289032, "step": 1306 }, { "clip_ratio": 0.0, "completion_length": 512.3683395385742, "epoch": 0.39041147039056084, "grad_norm": 0.23312953114509583, "kl": 0.202880859375, "learning_rate": 1.5286213255862295e-05, "loss": 0.1422, "reward": 2.1328125596046448, "reward_std": 0.274727251380682, "rewards/accuracy_reward": 0.19419643515720963, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.972098246216774, "step": 1307 }, { "clip_ratio": 0.0, "completion_length": 538.8594131469727, "epoch": 0.3907101784780823, "grad_norm": 0.2693275809288025, "kl": 0.137451171875, "learning_rate": 1.527735657957651e-05, "loss": 0.0724, "reward": 2.0580358505249023, "reward_std": 0.22031675651669502, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9843750596046448, "step": 1308 }, { "clip_ratio": 0.0, "completion_length": 536.7277069091797, "epoch": 0.3910088865656038, "grad_norm": 0.1442943811416626, "kl": 0.263427734375, "learning_rate": 1.5268494162044008e-05, "loss": 0.0438, "reward": 2.0613840222358704, "reward_std": 0.1831461638212204, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9899553805589676, "step": 1309 }, { "clip_ratio": 0.0, "completion_length": 558.841552734375, "epoch": 0.39130759465312526, "grad_norm": 0.1522735357284546, "kl": 0.1116943359375, "learning_rate": 1.5259626012906227e-05, "loss": 0.057, "reward": 2.0491071939468384, "reward_std": 0.1968617644160986, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.98214291036129, "step": 1310 }, { "clip_ratio": 0.0, "completion_length": 538.3973541259766, "epoch": 0.39160630274064673, "grad_norm": 0.4335584342479706, "kl": 0.6546630859375, "learning_rate": 1.5250752141810839e-05, "loss": 0.0865, "reward": 2.07366082072258, "reward_std": 0.19083546474575996, "rewards/accuracy_reward": 0.11160715017467737, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9843750447034836, "step": 1311 }, { "clip_ratio": 0.0, "completion_length": 541.4174346923828, "epoch": 0.3919050108281682, "grad_norm": 0.4370613992214203, "kl": 0.2457275390625, "learning_rate": 1.524187255841175e-05, "loss": 0.0106, "reward": 2.0379465222358704, "reward_std": 0.13018860947340727, "rewards/accuracy_reward": 0.05803571850992739, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9933035969734192, "step": 1312 }, { "clip_ratio": 0.0, "completion_length": 551.3750305175781, "epoch": 0.39220371891568967, "grad_norm": 0.1829095035791397, "kl": 0.1141357421875, "learning_rate": 1.5232987272369076e-05, "loss": 0.0415, "reward": 2.1099331080913544, "reward_std": 0.2295248731970787, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9849330633878708, "step": 1313 }, { "clip_ratio": 0.0, "completion_length": 521.8861846923828, "epoch": 0.3925024270032111, "grad_norm": 0.1743454784154892, "kl": 0.1236572265625, "learning_rate": 1.5224096293349137e-05, "loss": 0.0623, "reward": 2.0691965222358704, "reward_std": 0.2005091905593872, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9866071790456772, "step": 1314 }, { "clip_ratio": 0.0, "completion_length": 608.0893096923828, "epoch": 0.39280113509073256, "grad_norm": 0.37132528424263, "kl": 0.1448974609375, "learning_rate": 1.5215199631024452e-05, "loss": 0.0745, "reward": 2.0853796005249023, "reward_std": 0.2037077397108078, "rewards/accuracy_reward": 0.12723215040750802, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9827009439468384, "step": 1315 }, { "clip_ratio": 0.0, "completion_length": 597.3437805175781, "epoch": 0.393099843178254, "grad_norm": 0.12078077346086502, "kl": 0.1124267578125, "learning_rate": 1.5206297295073706e-05, "loss": 0.0257, "reward": 2.0931920409202576, "reward_std": 0.16243122331798077, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9860491454601288, "step": 1316 }, { "clip_ratio": 0.0, "completion_length": 582.9464569091797, "epoch": 0.3933985512657755, "grad_norm": 0.1405971199274063, "kl": 0.114501953125, "learning_rate": 1.519738929518178e-05, "loss": 0.0417, "reward": 2.0039063692092896, "reward_std": 0.22766468115150928, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9793527126312256, "step": 1317 }, { "clip_ratio": 0.0, "completion_length": 568.7834930419922, "epoch": 0.39369725935329697, "grad_norm": 0.1958257257938385, "kl": 0.1552734375, "learning_rate": 1.51884756410397e-05, "loss": 0.0865, "reward": 2.0027902722358704, "reward_std": 0.2892245426774025, "rewards/accuracy_reward": 0.06026786100119352, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9737723767757416, "step": 1318 }, { "clip_ratio": 0.0, "completion_length": 562.0089569091797, "epoch": 0.39399596744081844, "grad_norm": 0.20797887444496155, "kl": 0.1446533203125, "learning_rate": 1.5179556342344643e-05, "loss": 0.0435, "reward": 2.090401828289032, "reward_std": 0.14007576555013657, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.981026828289032, "step": 1319 }, { "clip_ratio": 0.0, "completion_length": 545.6227874755859, "epoch": 0.3942946755283399, "grad_norm": 0.15890908241271973, "kl": 0.1671142578125, "learning_rate": 1.5170631408799938e-05, "loss": 0.0824, "reward": 2.021205484867096, "reward_std": 0.23091118037700653, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9787946790456772, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 547.272346496582, "epoch": 0.3945933836158614, "grad_norm": 0.1747090071439743, "kl": 0.14453125, "learning_rate": 1.516170085011504e-05, "loss": 0.0855, "reward": 2.0340403020381927, "reward_std": 0.16225475817918777, "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9893973618745804, "step": 1321 }, { "clip_ratio": 0.0, "completion_length": 543.9888610839844, "epoch": 0.39489209170338285, "grad_norm": 0.13722090423107147, "kl": 0.1080322265625, "learning_rate": 1.5152764676005518e-05, "loss": 0.0685, "reward": 2.054687589406967, "reward_std": 0.19268947839736938, "rewards/accuracy_reward": 0.09151786169968545, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9854911267757416, "step": 1322 }, { "clip_ratio": 0.0, "completion_length": 523.3660888671875, "epoch": 0.3951907997909043, "grad_norm": 0.3159802556037903, "kl": 0.1790771484375, "learning_rate": 1.514382289619305e-05, "loss": 0.0906, "reward": 2.0401786863803864, "reward_std": 0.21427350118756294, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.98214291036129, "step": 1323 }, { "clip_ratio": 0.0, "completion_length": 517.9977951049805, "epoch": 0.3954895078784258, "grad_norm": 0.1937611699104309, "kl": 0.116943359375, "learning_rate": 1.5134875520405423e-05, "loss": 0.0373, "reward": 2.1367188692092896, "reward_std": 0.1615622527897358, "rewards/accuracy_reward": 0.15625000488944352, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.991629496216774, "step": 1324 }, { "clip_ratio": 0.0, "completion_length": 541.5714569091797, "epoch": 0.39578821596594727, "grad_norm": 0.24210086464881897, "kl": 0.246826171875, "learning_rate": 1.51259225583765e-05, "loss": 0.0395, "reward": 2.1333706378936768, "reward_std": 0.1488004494458437, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9905134290456772, "step": 1325 }, { "clip_ratio": 0.0, "completion_length": 498.70092010498047, "epoch": 0.39608692405346874, "grad_norm": 0.11180644482374191, "kl": 0.112060546875, "learning_rate": 1.511696401984623e-05, "loss": 0.0542, "reward": 2.122767984867096, "reward_std": 0.16655504703521729, "rewards/accuracy_reward": 0.14955358300358057, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9888393133878708, "step": 1326 }, { "clip_ratio": 0.0, "completion_length": 558.122802734375, "epoch": 0.3963856321409902, "grad_norm": 0.06792212277650833, "kl": 0.08203125, "learning_rate": 1.5107999914560618e-05, "loss": 0.0103, "reward": 2.122209906578064, "reward_std": 0.06321168504655361, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 1327 }, { "clip_ratio": 0.0, "completion_length": 549.6562728881836, "epoch": 0.3966843402285117, "grad_norm": 0.09812244027853012, "kl": 0.0855712890625, "learning_rate": 1.5099030252271742e-05, "loss": 0.0232, "reward": 2.174107253551483, "reward_std": 0.09225907735526562, "rewards/accuracy_reward": 0.18080358440056443, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678805589676, "step": 1328 }, { "clip_ratio": 0.0, "completion_length": 591.3861999511719, "epoch": 0.39698304831603315, "grad_norm": 0.1373293399810791, "kl": 0.14404296875, "learning_rate": 1.509005504273771e-05, "loss": 0.0149, "reward": 2.072544753551483, "reward_std": 0.14693515561521053, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875298023224, "step": 1329 }, { "clip_ratio": 0.0, "completion_length": 582.8861846923828, "epoch": 0.3972817564035546, "grad_norm": 0.18691007792949677, "kl": 0.1175537109375, "learning_rate": 1.5081074295722666e-05, "loss": 0.0493, "reward": 2.044642984867096, "reward_std": 0.19922304153442383, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.988839328289032, "step": 1330 }, { "clip_ratio": 0.0, "completion_length": 580.607177734375, "epoch": 0.3975804644910761, "grad_norm": 0.11628378927707672, "kl": 0.0924072265625, "learning_rate": 1.5072088020996791e-05, "loss": 0.0311, "reward": 2.228794753551483, "reward_std": 0.12078910414129496, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875298023224, "step": 1331 }, { "clip_ratio": 0.0, "completion_length": 535.1875305175781, "epoch": 0.39787917257859756, "grad_norm": 0.22511497139930725, "kl": 0.0986328125, "learning_rate": 1.5063096228336265e-05, "loss": 0.0187, "reward": 2.134486675262451, "reward_std": 0.09224463999271393, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.998325914144516, "step": 1332 }, { "clip_ratio": 0.0, "completion_length": 582.7545013427734, "epoch": 0.39817788066611903, "grad_norm": 0.6129051446914673, "kl": 0.30224609375, "learning_rate": 1.5054098927523281e-05, "loss": 0.027, "reward": 2.0485492050647736, "reward_std": 0.1475992426276207, "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 1333 }, { "clip_ratio": 0.0, "completion_length": 549.5670013427734, "epoch": 0.3984765887536405, "grad_norm": 0.5589571595191956, "kl": 0.1163330078125, "learning_rate": 1.5045096128346017e-05, "loss": 0.053, "reward": 2.1333706378936768, "reward_std": 0.21846075542271137, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 1334 }, { "clip_ratio": 0.0, "completion_length": 551.7143096923828, "epoch": 0.398775296841162, "grad_norm": 0.12937279045581818, "kl": 0.13232421875, "learning_rate": 1.503608784059864e-05, "loss": 0.0241, "reward": 2.1116071939468384, "reward_std": 0.09678617864847183, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.9955357313156128, "step": 1335 }, { "clip_ratio": 0.0, "completion_length": 534.4151916503906, "epoch": 0.39907400492868345, "grad_norm": 0.13311761617660522, "kl": 0.1016845703125, "learning_rate": 1.5027074074081282e-05, "loss": 0.0193, "reward": 2.1367189288139343, "reward_std": 0.08808019291609526, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9983258992433548, "step": 1336 }, { "clip_ratio": 0.0, "completion_length": 544.1652069091797, "epoch": 0.3993727130162049, "grad_norm": 0.13810421526432037, "kl": 0.137451171875, "learning_rate": 1.5018054838600033e-05, "loss": 0.0487, "reward": 2.1088170409202576, "reward_std": 0.13031224999576807, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.990513414144516, "step": 1337 }, { "clip_ratio": 0.0, "completion_length": 522.116096496582, "epoch": 0.3996714211037264, "grad_norm": 1.9447762966156006, "kl": 0.3934326171875, "learning_rate": 1.5009030143966948e-05, "loss": 0.0357, "reward": 2.1183037161827087, "reward_std": 0.10996300727128983, "rewards/accuracy_reward": 0.13616072130389512, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714626312256, "step": 1338 }, { "clip_ratio": 0.0, "completion_length": 527.4531478881836, "epoch": 0.39997012919124786, "grad_norm": 0.10256651788949966, "kl": 0.1136474609375, "learning_rate": 1.5000000000000002e-05, "loss": 0.0188, "reward": 2.1322546005249023, "reward_std": 0.06905786879360676, "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9983258992433548, "step": 1339 }, { "clip_ratio": 0.0, "completion_length": 554.4375152587891, "epoch": 0.40026883727876933, "grad_norm": 0.12927256524562836, "kl": 0.0867919921875, "learning_rate": 1.4990964416523108e-05, "loss": 0.0071, "reward": 2.0987724661827087, "reward_std": 0.0868635019287467, "rewards/accuracy_reward": 0.10267857671715319, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9983258992433548, "step": 1340 }, { "clip_ratio": 0.0, "completion_length": 551.9754791259766, "epoch": 0.4005675453662908, "grad_norm": 0.14353443682193756, "kl": 0.170654296875, "learning_rate": 1.4981923403366096e-05, "loss": 0.0374, "reward": 2.0412947833538055, "reward_std": 0.1935544740408659, "rewards/accuracy_reward": 0.07366071967408061, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9854911118745804, "step": 1341 }, { "clip_ratio": 0.0, "completion_length": 515.0558166503906, "epoch": 0.40086625345381227, "grad_norm": 0.49533161520957947, "kl": 0.3826904296875, "learning_rate": 1.4972876970364703e-05, "loss": 0.0956, "reward": 2.09319207072258, "reward_std": 0.1728704832494259, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9860491454601288, "step": 1342 }, { "clip_ratio": 0.0, "completion_length": 535.6361999511719, "epoch": 0.40116496154133374, "grad_norm": 0.1515824943780899, "kl": 0.140380859375, "learning_rate": 1.496382512736056e-05, "loss": 0.0444, "reward": 2.0251117050647736, "reward_std": 0.19942459277808666, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9871651977300644, "step": 1343 }, { "clip_ratio": 0.0, "completion_length": 548.919677734375, "epoch": 0.4014636696288552, "grad_norm": 0.22259147465229034, "kl": 0.2977294921875, "learning_rate": 1.4954767884201186e-05, "loss": 0.0053, "reward": 2.090959906578064, "reward_std": 0.08585312962532043, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9972098469734192, "step": 1344 }, { "clip_ratio": 0.0, "completion_length": 536.0156707763672, "epoch": 0.4017623777163767, "grad_norm": 0.28510090708732605, "kl": 0.220703125, "learning_rate": 1.4945705250739972e-05, "loss": 0.0703, "reward": 1.9843750894069672, "reward_std": 0.2233084701001644, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9799107611179352, "step": 1345 }, { "clip_ratio": 0.0, "completion_length": 537.8794937133789, "epoch": 0.40206108580389815, "grad_norm": 0.17944790422916412, "kl": 0.17626953125, "learning_rate": 1.4936637236836178e-05, "loss": 0.0496, "reward": 2.053571581840515, "reward_std": 0.16106495633721352, "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9910714626312256, "step": 1346 }, { "clip_ratio": 0.0, "completion_length": 573.7410888671875, "epoch": 0.4023597938914196, "grad_norm": 0.11682826280593872, "kl": 0.1611328125, "learning_rate": 1.492756385235491e-05, "loss": 0.0387, "reward": 1.9832590818405151, "reward_std": 0.1268101530149579, "rewards/accuracy_reward": 0.013392857974395156, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9854910969734192, "step": 1347 }, { "clip_ratio": 0.0, "completion_length": 560.0826110839844, "epoch": 0.4026585019789411, "grad_norm": 0.2151687890291214, "kl": 0.188720703125, "learning_rate": 1.4918485107167127e-05, "loss": 0.0425, "reward": 2.0468750596046448, "reward_std": 0.15646830573678017, "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9866071790456772, "step": 1348 }, { "clip_ratio": 0.0, "completion_length": 568.7969055175781, "epoch": 0.40295721006646257, "grad_norm": 0.14759521186351776, "kl": 0.1763916015625, "learning_rate": 1.490940101114961e-05, "loss": 0.0531, "reward": 2.119977831840515, "reward_std": 0.20575106516480446, "rewards/accuracy_reward": 0.15848214412108064, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.983816996216774, "step": 1349 }, { "clip_ratio": 0.0, "completion_length": 589.3683319091797, "epoch": 0.40325591815398404, "grad_norm": 0.23314474523067474, "kl": 0.145263671875, "learning_rate": 1.4900311574184967e-05, "loss": 0.0555, "reward": 2.12834832072258, "reward_std": 0.17300471663475037, "rewards/accuracy_reward": 0.1584821459837258, "rewards/format_reward": 0.9821429252624512, "rewards/tag_count_reward": 0.9877232611179352, "step": 1350 }, { "clip_ratio": 0.0, "completion_length": 611.4643249511719, "epoch": 0.4035546262415055, "grad_norm": 0.16291578114032745, "kl": 0.398193359375, "learning_rate": 1.4891216806161613e-05, "loss": 0.0152, "reward": 2.014508992433548, "reward_std": 0.18908417224884033, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9899553954601288, "step": 1351 }, { "clip_ratio": 0.0, "completion_length": 620.5982513427734, "epoch": 0.403853334329027, "grad_norm": 0.2763533890247345, "kl": 0.1920166015625, "learning_rate": 1.488211671697376e-05, "loss": 0.038, "reward": 2.025111675262451, "reward_std": 0.14091841969639063, "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973618745804, "step": 1352 }, { "clip_ratio": 0.0, "completion_length": 612.7388610839844, "epoch": 0.40415204241654845, "grad_norm": 0.2099248170852661, "kl": 0.271484375, "learning_rate": 1.4873011316521421e-05, "loss": 0.0393, "reward": 2.092076003551483, "reward_std": 0.1658701580017805, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9938616454601288, "step": 1353 }, { "clip_ratio": 0.0, "completion_length": 631.4844055175781, "epoch": 0.4044507505040699, "grad_norm": 0.5729277729988098, "kl": 0.1317138671875, "learning_rate": 1.4863900614710379e-05, "loss": 0.0418, "reward": 2.212611675262451, "reward_std": 0.2314375899732113, "rewards/accuracy_reward": 0.2455357313156128, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.984933078289032, "step": 1354 }, { "clip_ratio": 0.0, "completion_length": 567.9062652587891, "epoch": 0.4047494585915914, "grad_norm": 0.20403122901916504, "kl": 0.800048828125, "learning_rate": 1.4854784621452176e-05, "loss": 0.0036, "reward": 2.1093751192092896, "reward_std": 0.16815319284796715, "rewards/accuracy_reward": 0.13169643585570157, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9910714626312256, "step": 1355 }, { "clip_ratio": 0.0, "completion_length": 632.5714569091797, "epoch": 0.40504816667911286, "grad_norm": 0.42646563053131104, "kl": 0.4600830078125, "learning_rate": 1.484566334666413e-05, "loss": 0.0468, "reward": 1.9944197237491608, "reward_std": 0.23367795534431934, "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9787946790456772, "step": 1356 }, { "clip_ratio": 0.0, "completion_length": 635.3236846923828, "epoch": 0.4053468747666343, "grad_norm": 0.3148544132709503, "kl": 0.126220703125, "learning_rate": 1.4836536800269288e-05, "loss": 0.0279, "reward": 2.0825893878936768, "reward_std": 0.16749460622668266, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9910714477300644, "step": 1357 }, { "clip_ratio": 0.0, "completion_length": 562.1361846923828, "epoch": 0.40564558285415575, "grad_norm": 1.2858885526657104, "kl": 0.870849609375, "learning_rate": 1.4827404992196436e-05, "loss": 0.0714, "reward": 2.0669643878936768, "reward_std": 0.1495267115533352, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9910714626312256, "step": 1358 }, { "clip_ratio": 0.0, "completion_length": 591.0960083007812, "epoch": 0.4059442909416772, "grad_norm": 0.22007274627685547, "kl": 0.1939697265625, "learning_rate": 1.481826793238009e-05, "loss": 0.0461, "reward": 2.1138393580913544, "reward_std": 0.23563328757882118, "rewards/accuracy_reward": 0.1473214291036129, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9866071790456772, "step": 1359 }, { "clip_ratio": 0.0, "completion_length": 588.1964569091797, "epoch": 0.4062429990291987, "grad_norm": 0.29093658924102783, "kl": 0.2012939453125, "learning_rate": 1.4809125630760477e-05, "loss": 0.0285, "reward": 2.064732253551483, "reward_std": 0.1185813806951046, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9933036118745804, "step": 1360 }, { "clip_ratio": 0.0, "completion_length": 577.2522583007812, "epoch": 0.40654170711672016, "grad_norm": 0.17366154491901398, "kl": 0.1243896484375, "learning_rate": 1.479997809728352e-05, "loss": 0.0218, "reward": 2.1473215222358704, "reward_std": 0.10068357177078724, "rewards/accuracy_reward": 0.16071429569274187, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9977678805589676, "step": 1361 }, { "clip_ratio": 0.0, "completion_length": 581.0893096923828, "epoch": 0.40684041520424163, "grad_norm": 0.2727562487125397, "kl": 0.160400390625, "learning_rate": 1.4790825341900844e-05, "loss": 0.0314, "reward": 2.0429688096046448, "reward_std": 0.13475053384900093, "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.991629496216774, "step": 1362 }, { "clip_ratio": 0.0, "completion_length": 571.366096496582, "epoch": 0.4071391232917631, "grad_norm": 0.2305491715669632, "kl": 0.1241455078125, "learning_rate": 1.4781667374569746e-05, "loss": 0.0625, "reward": 2.098772406578064, "reward_std": 0.1855385284870863, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973618745804, "step": 1363 }, { "clip_ratio": 0.0, "completion_length": 576.9687805175781, "epoch": 0.4074378313792846, "grad_norm": 0.47086140513420105, "kl": 0.31640625, "learning_rate": 1.4772504205253197e-05, "loss": 0.1153, "reward": 2.1121652722358704, "reward_std": 0.23142408579587936, "rewards/accuracy_reward": 0.15401786006987095, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9804688096046448, "step": 1364 }, { "clip_ratio": 0.0, "completion_length": 576.0201110839844, "epoch": 0.40773653946680605, "grad_norm": 0.21951039135456085, "kl": 0.2509765625, "learning_rate": 1.476333584391983e-05, "loss": 0.0318, "reward": 2.088727831840515, "reward_std": 0.18763837590813637, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9927455931901932, "step": 1365 }, { "clip_ratio": 0.0, "completion_length": 629.1964569091797, "epoch": 0.4080352475543275, "grad_norm": 0.1367127150297165, "kl": 0.0892333984375, "learning_rate": 1.4754162300543922e-05, "loss": 0.0217, "reward": 2.1562501192092896, "reward_std": 0.154420112259686, "rewards/accuracy_reward": 0.16294643888249993, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678656578064, "step": 1366 }, { "clip_ratio": 0.0, "completion_length": 608.4531707763672, "epoch": 0.408333955641849, "grad_norm": 0.43023163080215454, "kl": 0.202392578125, "learning_rate": 1.4744983585105388e-05, "loss": 0.0257, "reward": 2.1266742050647736, "reward_std": 0.1543823815882206, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9927455633878708, "step": 1367 }, { "clip_ratio": 0.0, "completion_length": 656.0960083007812, "epoch": 0.40863266372937046, "grad_norm": 0.12330281734466553, "kl": 0.0992431640625, "learning_rate": 1.4735799707589773e-05, "loss": 0.0238, "reward": 1.9933036267757416, "reward_std": 0.13051528856158257, "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9910714775323868, "step": 1368 }, { "clip_ratio": 0.0, "completion_length": 623.0402069091797, "epoch": 0.40893137181689193, "grad_norm": 0.2681633234024048, "kl": 0.090576171875, "learning_rate": 1.4726610677988232e-05, "loss": 0.0171, "reward": 2.1222099661827087, "reward_std": 0.14377425238490105, "rewards/accuracy_reward": 0.14285714644938707, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9927455931901932, "step": 1369 }, { "clip_ratio": 0.0, "completion_length": 598.6763687133789, "epoch": 0.4092300799044134, "grad_norm": 0.1742398738861084, "kl": 0.1171875, "learning_rate": 1.4717416506297535e-05, "loss": 0.0373, "reward": 2.0602679550647736, "reward_std": 0.09669382870197296, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9955357313156128, "step": 1370 }, { "clip_ratio": 0.0, "completion_length": 616.9397583007812, "epoch": 0.4095287879919349, "grad_norm": 0.12410473823547363, "kl": 0.0914306640625, "learning_rate": 1.470821720252003e-05, "loss": 0.0096, "reward": 2.0546875596046448, "reward_std": 0.15216714143753052, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.996651828289032, "step": 1371 }, { "clip_ratio": 0.0, "completion_length": 578.5245819091797, "epoch": 0.40982749607945634, "grad_norm": 0.10297005623579025, "kl": 0.1697998046875, "learning_rate": 1.4699012776663668e-05, "loss": 0.0351, "reward": 2.0669643878936768, "reward_std": 0.12941131368279457, "rewards/accuracy_reward": 0.0848214344587177, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714626312256, "step": 1372 }, { "clip_ratio": 0.0, "completion_length": 594.4866333007812, "epoch": 0.4101262041669778, "grad_norm": 0.2615639865398407, "kl": 0.1502685546875, "learning_rate": 1.4689803238741955e-05, "loss": 0.0262, "reward": 2.0820313096046448, "reward_std": 0.10083319991827011, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 1373 }, { "clip_ratio": 0.0, "completion_length": 600.0536041259766, "epoch": 0.4104249122544993, "grad_norm": 0.29260513186454773, "kl": 0.20263671875, "learning_rate": 1.468058859877397e-05, "loss": 0.0526, "reward": 2.016183078289032, "reward_std": 0.178347360342741, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9893973618745804, "step": 1374 }, { "clip_ratio": 0.0, "completion_length": 591.1071624755859, "epoch": 0.41072362034202076, "grad_norm": 0.3476851284503937, "kl": 0.3116455078125, "learning_rate": 1.4671368866784338e-05, "loss": 0.0236, "reward": 2.0345982909202576, "reward_std": 0.1550319343805313, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875298023224, "step": 1375 }, { "clip_ratio": 0.0, "completion_length": 589.3861846923828, "epoch": 0.4110223284295422, "grad_norm": 0.20026203989982605, "kl": 0.110595703125, "learning_rate": 1.4662144052803223e-05, "loss": 0.0267, "reward": 2.1439732909202576, "reward_std": 0.12725186720490456, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966518133878708, "step": 1376 }, { "clip_ratio": 0.0, "completion_length": 513.5022659301758, "epoch": 0.4113210365170637, "grad_norm": 0.16657917201519012, "kl": 0.177734375, "learning_rate": 1.4652914166866312e-05, "loss": 0.0504, "reward": 2.1718751192092896, "reward_std": 0.1794080138206482, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9866071790456772, "step": 1377 }, { "clip_ratio": 0.0, "completion_length": 550.9754638671875, "epoch": 0.41161974460458517, "grad_norm": 0.14787229895591736, "kl": 0.1158447265625, "learning_rate": 1.4643679219014827e-05, "loss": 0.046, "reward": 2.181361675262451, "reward_std": 0.11710398737341166, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9938616305589676, "step": 1378 }, { "clip_ratio": 0.0, "completion_length": 573.3482208251953, "epoch": 0.41191845269210664, "grad_norm": 0.10350670665502548, "kl": 0.12744140625, "learning_rate": 1.463443921929548e-05, "loss": 0.0275, "reward": 2.0904018878936768, "reward_std": 0.10717056412249804, "rewards/accuracy_reward": 0.1026785729918629, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 1379 }, { "clip_ratio": 0.0, "completion_length": 531.9665374755859, "epoch": 0.4122171607796281, "grad_norm": 0.1149691715836525, "kl": 0.1024169921875, "learning_rate": 1.4625194177760485e-05, "loss": 0.0124, "reward": 2.0853795409202576, "reward_std": 0.07447467558085918, "rewards/accuracy_reward": 0.08928572130389512, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9983258992433548, "step": 1380 }, { "clip_ratio": 0.0, "completion_length": 497.26341247558594, "epoch": 0.4125158688671496, "grad_norm": 0.15679502487182617, "kl": 0.0999755859375, "learning_rate": 1.4615944104467544e-05, "loss": 0.0066, "reward": 2.1132813692092896, "reward_std": 0.12427168060094118, "rewards/accuracy_reward": 0.12053572246804833, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.997209832072258, "step": 1381 }, { "clip_ratio": 0.0, "completion_length": 620.7120819091797, "epoch": 0.41281457695467105, "grad_norm": 0.2159040868282318, "kl": 0.240234375, "learning_rate": 1.4606689009479829e-05, "loss": 0.0276, "reward": 2.0775671005249023, "reward_std": 0.12019018456339836, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.990513414144516, "step": 1382 }, { "clip_ratio": 0.0, "completion_length": 564.0223388671875, "epoch": 0.4131132850421925, "grad_norm": 0.2742241621017456, "kl": 0.14794921875, "learning_rate": 1.4597428902865973e-05, "loss": 0.0367, "reward": 2.0652902722358704, "reward_std": 0.18390102684497833, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.9871651977300644, "step": 1383 }, { "clip_ratio": 0.0, "completion_length": 501.44644927978516, "epoch": 0.413411993129714, "grad_norm": 0.49670594930648804, "kl": 0.1893310546875, "learning_rate": 1.4588163794700068e-05, "loss": 0.0569, "reward": 2.1132813692092896, "reward_std": 0.1262818444520235, "rewards/accuracy_reward": 0.13169643329456449, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9927455633878708, "step": 1384 }, { "clip_ratio": 0.0, "completion_length": 561.9062805175781, "epoch": 0.41371070121723547, "grad_norm": 0.1583176553249359, "kl": 0.141357421875, "learning_rate": 1.4578893695061644e-05, "loss": 0.0852, "reward": 2.03850457072258, "reward_std": 0.17662855610251427, "rewards/accuracy_reward": 0.06919643329456449, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9871652275323868, "step": 1385 }, { "clip_ratio": 0.0, "completion_length": 527.1875152587891, "epoch": 0.41400940930475694, "grad_norm": 0.4614390730857849, "kl": 0.335205078125, "learning_rate": 1.456961861403566e-05, "loss": 0.0852, "reward": 2.0764509737491608, "reward_std": 0.22500621899962425, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9827009439468384, "step": 1386 }, { "clip_ratio": 0.0, "completion_length": 563.5625305175781, "epoch": 0.4143081173922784, "grad_norm": 0.3030627369880676, "kl": 0.175048828125, "learning_rate": 1.4560338561712495e-05, "loss": 0.0244, "reward": 2.0697545409202576, "reward_std": 0.14817016571760178, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.991629496216774, "step": 1387 }, { "clip_ratio": 0.0, "completion_length": 560.6674346923828, "epoch": 0.4146068254797999, "grad_norm": 0.5509800910949707, "kl": 0.28271484375, "learning_rate": 1.4551053548187933e-05, "loss": 0.0689, "reward": 2.054129511117935, "reward_std": 0.2381148412823677, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9893973618745804, "step": 1388 }, { "clip_ratio": 0.0, "completion_length": 531.7790451049805, "epoch": 0.41490553356732135, "grad_norm": 0.2816396951675415, "kl": 0.1622314453125, "learning_rate": 1.4541763583563165e-05, "loss": 0.068, "reward": 2.0418527722358704, "reward_std": 0.19368455186486244, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9882812947034836, "step": 1389 }, { "clip_ratio": 0.0, "completion_length": 535.8527069091797, "epoch": 0.4152042416548428, "grad_norm": 0.6908566355705261, "kl": 0.19140625, "learning_rate": 1.4532468677944758e-05, "loss": 0.1073, "reward": 2.098772406578064, "reward_std": 0.23583313822746277, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.984933078289032, "step": 1390 }, { "clip_ratio": 0.0, "completion_length": 551.888427734375, "epoch": 0.4155029497423643, "grad_norm": 0.359466016292572, "kl": 0.1536865234375, "learning_rate": 1.4523168841444657e-05, "loss": 0.0259, "reward": 2.0345983505249023, "reward_std": 0.27802083268761635, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9854911118745804, "step": 1391 }, { "clip_ratio": 0.0, "completion_length": 551.2857437133789, "epoch": 0.41580165782988576, "grad_norm": 0.22745753824710846, "kl": 0.219482421875, "learning_rate": 1.4513864084180176e-05, "loss": 0.0553, "reward": 2.017299234867096, "reward_std": 0.2793191932141781, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.9464285969734192, "rewards/tag_count_reward": 0.9815848767757416, "step": 1392 }, { "clip_ratio": 0.0, "completion_length": 596.6562652587891, "epoch": 0.41610036591740723, "grad_norm": 0.2667771577835083, "kl": 0.1280517578125, "learning_rate": 1.4504554416273977e-05, "loss": 0.0523, "reward": 2.106584906578064, "reward_std": 0.1789970714598894, "rewards/accuracy_reward": 0.14732143399305642, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9860491454601288, "step": 1393 }, { "clip_ratio": 0.0, "completion_length": 603.4643096923828, "epoch": 0.4163990740049287, "grad_norm": 0.3423832356929779, "kl": 0.1630859375, "learning_rate": 1.4495239847854071e-05, "loss": 0.0228, "reward": 2.042968839406967, "reward_std": 0.25857267156243324, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9871652275323868, "step": 1394 }, { "clip_ratio": 0.0, "completion_length": 559.2276992797852, "epoch": 0.4166977820924502, "grad_norm": 0.16469112038612366, "kl": 0.0887451171875, "learning_rate": 1.4485920389053786e-05, "loss": 0.0315, "reward": 1.97600457072258, "reward_std": 0.2378873974084854, "rewards/accuracy_reward": 0.03125000209547579, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.984933078289032, "step": 1395 }, { "clip_ratio": 0.0, "completion_length": 580.0402069091797, "epoch": 0.41699649017997165, "grad_norm": 0.15943624079227448, "kl": 0.1026611328125, "learning_rate": 1.4476596050011787e-05, "loss": 0.0102, "reward": 2.05803582072258, "reward_std": 0.19154239632189274, "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9933036118745804, "step": 1396 }, { "clip_ratio": 0.0, "completion_length": 526.4397659301758, "epoch": 0.4172951982674931, "grad_norm": 0.272894948720932, "kl": 0.1309814453125, "learning_rate": 1.4467266840872041e-05, "loss": 0.0501, "reward": 2.0044643878936768, "reward_std": 0.12394729163497686, "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 1397 }, { "clip_ratio": 0.0, "completion_length": 514.6495742797852, "epoch": 0.4175939063550146, "grad_norm": 0.18809615075588226, "kl": 0.11865234375, "learning_rate": 1.4457932771783808e-05, "loss": 0.0273, "reward": 2.017299175262451, "reward_std": 0.10267364699393511, "rewards/accuracy_reward": 0.033482145285233855, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949777126312256, "step": 1398 }, { "clip_ratio": 0.0, "completion_length": 457.3772430419922, "epoch": 0.41789261444253606, "grad_norm": 0.19246208667755127, "kl": 0.10791015625, "learning_rate": 1.4448593852901644e-05, "loss": 0.0399, "reward": 2.102678656578064, "reward_std": 0.14527585729956627, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9955357313156128, "step": 1399 }, { "clip_ratio": 0.0, "completion_length": 448.2768020629883, "epoch": 0.4181913225300575, "grad_norm": 0.21175380051136017, "kl": 0.1285400390625, "learning_rate": 1.443925009438538e-05, "loss": 0.029, "reward": 2.055803656578064, "reward_std": 0.20553640369325876, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9933036118745804, "step": 1400 }, { "clip_ratio": 0.0, "completion_length": 451.71653747558594, "epoch": 0.41849003061757895, "grad_norm": 0.4730311632156372, "kl": 0.17529296875, "learning_rate": 1.4429901506400106e-05, "loss": 0.0388, "reward": 2.0284599661827087, "reward_std": 0.1333972467109561, "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455484867096, "step": 1401 }, { "clip_ratio": 0.0, "completion_length": 465.66519927978516, "epoch": 0.4187887387051004, "grad_norm": 0.12950068712234497, "kl": 0.124755859375, "learning_rate": 1.4420548099116167e-05, "loss": 0.0202, "reward": 2.024553656578064, "reward_std": 0.06793725118041039, "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9977678656578064, "step": 1402 }, { "clip_ratio": 0.0, "completion_length": 468.43082427978516, "epoch": 0.4190874467926219, "grad_norm": 0.2726738452911377, "kl": 0.11669921875, "learning_rate": 1.441118988270916e-05, "loss": 0.0253, "reward": 2.0920759439468384, "reward_std": 0.1588739026337862, "rewards/accuracy_reward": 0.10714286239817739, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9960937649011612, "step": 1403 }, { "clip_ratio": 0.0, "completion_length": 484.2232437133789, "epoch": 0.41938615488014336, "grad_norm": 0.45001542568206787, "kl": 0.177978515625, "learning_rate": 1.4401826867359903e-05, "loss": 0.057, "reward": 2.059709906578064, "reward_std": 0.20341398939490318, "rewards/accuracy_reward": 0.08035714458674192, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 1404 }, { "clip_ratio": 0.0, "completion_length": 546.5491409301758, "epoch": 0.41968486296766483, "grad_norm": 0.38324999809265137, "kl": 0.1502685546875, "learning_rate": 1.4392459063254438e-05, "loss": 0.0367, "reward": 2.0703126192092896, "reward_std": 0.24864330142736435, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9877232611179352, "step": 1405 }, { "clip_ratio": 0.0, "completion_length": 543.1540298461914, "epoch": 0.4199835710551863, "grad_norm": 0.19888578355312347, "kl": 0.1251220703125, "learning_rate": 1.438308648058402e-05, "loss": 0.0499, "reward": 2.0786831974983215, "reward_std": 0.2039795145392418, "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9893973469734192, "step": 1406 }, { "clip_ratio": 0.0, "completion_length": 581.3125152587891, "epoch": 0.42028227914270777, "grad_norm": 0.3431844413280487, "kl": 0.1290283203125, "learning_rate": 1.4373709129545101e-05, "loss": 0.0657, "reward": 2.0496653020381927, "reward_std": 0.1818600818514824, "rewards/accuracy_reward": 0.08035714505240321, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652126312256, "step": 1407 }, { "clip_ratio": 0.0, "completion_length": 592.5111846923828, "epoch": 0.42058098723022924, "grad_norm": 0.2089521288871765, "kl": 0.1280517578125, "learning_rate": 1.4364327020339319e-05, "loss": 0.0631, "reward": 2.075334906578064, "reward_std": 0.20123997144401073, "rewards/accuracy_reward": 0.10491071850992739, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9882812947034836, "step": 1408 }, { "clip_ratio": 0.0, "completion_length": 616.1450958251953, "epoch": 0.4208796953177507, "grad_norm": 0.17867296934127808, "kl": 0.1614990234375, "learning_rate": 1.4354940163173486e-05, "loss": 0.035, "reward": 2.0641742646694183, "reward_std": 0.15992127545177937, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.990513414144516, "step": 1409 }, { "clip_ratio": 0.0, "completion_length": 622.7924346923828, "epoch": 0.4211784034052722, "grad_norm": 0.14941544830799103, "kl": 0.3302001953125, "learning_rate": 1.4345548568259586e-05, "loss": 0.0347, "reward": 2.1088171005249023, "reward_std": 0.19317157939076424, "rewards/accuracy_reward": 0.13392857322469354, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9905134290456772, "step": 1410 }, { "clip_ratio": 0.0, "completion_length": 639.810302734375, "epoch": 0.42147711149279365, "grad_norm": 0.13734722137451172, "kl": 0.144775390625, "learning_rate": 1.4336152245814755e-05, "loss": 0.0514, "reward": 2.0820313692092896, "reward_std": 0.16872656345367432, "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9882812798023224, "step": 1411 }, { "clip_ratio": 0.0, "completion_length": 651.6138763427734, "epoch": 0.4217758195803151, "grad_norm": 0.20642101764678955, "kl": 0.133544921875, "learning_rate": 1.4326751206061268e-05, "loss": 0.0391, "reward": 2.055803656578064, "reward_std": 0.1363273225724697, "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.988839328289032, "step": 1412 }, { "clip_ratio": 0.0, "completion_length": 663.8571624755859, "epoch": 0.4220745276678366, "grad_norm": 0.20962676405906677, "kl": 0.1375732421875, "learning_rate": 1.4317345459226536e-05, "loss": 0.0314, "reward": 2.0485492646694183, "reward_std": 0.2094246968626976, "rewards/accuracy_reward": 0.09821429220028222, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.983816996216774, "step": 1413 }, { "clip_ratio": 0.0, "completion_length": 645.6138763427734, "epoch": 0.42237323575535807, "grad_norm": 0.687067985534668, "kl": 0.24853515625, "learning_rate": 1.4307935015543093e-05, "loss": 0.0773, "reward": 1.9737723767757416, "reward_std": 0.2752531096339226, "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9760045111179352, "step": 1414 }, { "clip_ratio": 0.0, "completion_length": 625.6964569091797, "epoch": 0.42267194384287954, "grad_norm": 0.5197868943214417, "kl": 0.417724609375, "learning_rate": 1.4298519885248574e-05, "loss": 0.0825, "reward": 2.0106027722358704, "reward_std": 0.2491779439151287, "rewards/accuracy_reward": 0.05357142933644354, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9815848618745804, "step": 1415 }, { "clip_ratio": 0.0, "completion_length": 623.6205444335938, "epoch": 0.422970651930401, "grad_norm": 0.311994343996048, "kl": 0.1251220703125, "learning_rate": 1.4289100078585718e-05, "loss": 0.0581, "reward": 2.113839417695999, "reward_std": 0.1633622469380498, "rewards/accuracy_reward": 0.14285714668221772, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9888393133878708, "step": 1416 }, { "clip_ratio": 0.0, "completion_length": 611.0937805175781, "epoch": 0.4232693600179225, "grad_norm": 0.2907688319683075, "kl": 0.1982421875, "learning_rate": 1.4279675605802355e-05, "loss": 0.1135, "reward": 2.1043528020381927, "reward_std": 0.3224915750324726, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9659598618745804, "step": 1417 }, { "clip_ratio": 0.0, "completion_length": 570.4977874755859, "epoch": 0.42356806810544395, "grad_norm": 0.2058248519897461, "kl": 0.141357421875, "learning_rate": 1.4270246477151386e-05, "loss": 0.0527, "reward": 2.109375089406967, "reward_std": 0.202810637652874, "rewards/accuracy_reward": 0.1495535746216774, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.9843750298023224, "step": 1418 }, { "clip_ratio": 0.0, "completion_length": 576.8393096923828, "epoch": 0.4238667761929654, "grad_norm": 0.34779560565948486, "kl": 0.215576171875, "learning_rate": 1.4260812702890778e-05, "loss": 0.0529, "reward": 2.0904019474983215, "reward_std": 0.19684760365635157, "rewards/accuracy_reward": 0.12723215157166123, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9832589626312256, "step": 1419 }, { "clip_ratio": 0.0, "completion_length": 558.0067138671875, "epoch": 0.4241654842804869, "grad_norm": 0.609412431716919, "kl": 0.2958984375, "learning_rate": 1.4251374293283555e-05, "loss": 0.1065, "reward": 2.0033483505249023, "reward_std": 0.28413328528404236, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9720982611179352, "step": 1420 }, { "clip_ratio": 0.0, "completion_length": 582.5826110839844, "epoch": 0.42446419236800836, "grad_norm": 0.2774973511695862, "kl": 0.208740234375, "learning_rate": 1.4241931258597781e-05, "loss": 0.0643, "reward": 2.0228795409202576, "reward_std": 0.1680207410827279, "rewards/accuracy_reward": 0.06026786123402417, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9827009290456772, "step": 1421 }, { "clip_ratio": 0.0, "completion_length": 554.4620819091797, "epoch": 0.42476290045552983, "grad_norm": 0.19395050406455994, "kl": 0.1890869140625, "learning_rate": 1.423248360910655e-05, "loss": 0.0682, "reward": 1.9977679550647736, "reward_std": 0.197933379560709, "rewards/accuracy_reward": 0.03571428847499192, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9843750447034836, "step": 1422 }, { "clip_ratio": 0.0, "completion_length": 573.1183166503906, "epoch": 0.4250616085430513, "grad_norm": 0.38193371891975403, "kl": 0.31982421875, "learning_rate": 1.4223031355087983e-05, "loss": 0.0696, "reward": 2.0658482909202576, "reward_std": 0.29321856424212456, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9620536267757416, "rewards/tag_count_reward": 0.9765625447034836, "step": 1423 }, { "clip_ratio": 0.0, "completion_length": 567.1138610839844, "epoch": 0.4253603166305728, "grad_norm": 0.3130532205104828, "kl": 0.254638671875, "learning_rate": 1.4213574506825201e-05, "loss": 0.065, "reward": 2.044642984867096, "reward_std": 0.19792966544628143, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.98214291036129, "step": 1424 }, { "clip_ratio": 0.0, "completion_length": 563.3772583007812, "epoch": 0.42565902471809425, "grad_norm": 0.2811669111251831, "kl": 0.2073974609375, "learning_rate": 1.4204113074606332e-05, "loss": 0.0807, "reward": 2.083147406578064, "reward_std": 0.23601013142615557, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.9804687798023224, "step": 1425 }, { "clip_ratio": 0.0, "completion_length": 574.935302734375, "epoch": 0.4259577328056157, "grad_norm": 0.8943202495574951, "kl": 0.44287109375, "learning_rate": 1.419464706872448e-05, "loss": 0.0563, "reward": 2.0691965222358704, "reward_std": 0.2598777674138546, "rewards/accuracy_reward": 0.11383929196745157, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9843750447034836, "step": 1426 }, { "clip_ratio": 0.0, "completion_length": 577.6741180419922, "epoch": 0.4262564408931372, "grad_norm": 0.4310232698917389, "kl": 0.213134765625, "learning_rate": 1.4185176499477742e-05, "loss": 0.0589, "reward": 2.041294753551483, "reward_std": 0.22767912782728672, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9810268133878708, "step": 1427 }, { "clip_ratio": 0.0, "completion_length": 577.0513458251953, "epoch": 0.42655514898065866, "grad_norm": 0.6317571401596069, "kl": 0.260986328125, "learning_rate": 1.4175701377169162e-05, "loss": 0.0764, "reward": 2.0496652722358704, "reward_std": 0.28495078161358833, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.9715402126312256, "step": 1428 }, { "clip_ratio": 0.0, "completion_length": 576.8303833007812, "epoch": 0.42685385706818013, "grad_norm": 0.5820203423500061, "kl": 0.297119140625, "learning_rate": 1.4166221712106749e-05, "loss": 0.0946, "reward": 2.1183037161827087, "reward_std": 0.2826809994876385, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.964285746216774, "rewards/tag_count_reward": 0.9776785969734192, "step": 1429 }, { "clip_ratio": 0.0, "completion_length": 566.9263763427734, "epoch": 0.4271525651557016, "grad_norm": 0.5290917158126831, "kl": 0.408203125, "learning_rate": 1.4156737514603443e-05, "loss": 0.0594, "reward": 2.0145090222358704, "reward_std": 0.21741580590605736, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.987723246216774, "step": 1430 }, { "clip_ratio": 0.0, "completion_length": 616.4218902587891, "epoch": 0.4274512732432231, "grad_norm": 0.3638021945953369, "kl": 0.1287841796875, "learning_rate": 1.4147248794977127e-05, "loss": 0.0562, "reward": 2.060267984867096, "reward_std": 0.19348927587270737, "rewards/accuracy_reward": 0.10267857369035482, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9866071790456772, "step": 1431 }, { "clip_ratio": 0.0, "completion_length": 619.5089416503906, "epoch": 0.42774998133074454, "grad_norm": 0.3608289957046509, "kl": 0.214111328125, "learning_rate": 1.4137755563550597e-05, "loss": 0.0343, "reward": 2.0340403020381927, "reward_std": 0.16998110339045525, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9893973618745804, "step": 1432 }, { "clip_ratio": 0.0, "completion_length": 564.2589569091797, "epoch": 0.428048689418266, "grad_norm": 0.41871994733810425, "kl": 0.1824951171875, "learning_rate": 1.4128257830651554e-05, "loss": 0.0569, "reward": 2.083705484867096, "reward_std": 0.13078372925519943, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9899553954601288, "step": 1433 }, { "clip_ratio": 0.0, "completion_length": 616.8504638671875, "epoch": 0.4283473975057875, "grad_norm": 0.43004918098449707, "kl": 0.15185546875, "learning_rate": 1.411875560661261e-05, "loss": 0.0214, "reward": 2.0558036267757416, "reward_std": 0.19305626302957535, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.9620536267757416, "rewards/tag_count_reward": 0.9866071790456772, "step": 1434 }, { "clip_ratio": 0.0, "completion_length": 548.1317138671875, "epoch": 0.42864610559330896, "grad_norm": 0.14838039875030518, "kl": 0.1708984375, "learning_rate": 1.4109248901771242e-05, "loss": 0.0269, "reward": 2.0781251192092896, "reward_std": 0.14610127545893192, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9888393133878708, "step": 1435 }, { "clip_ratio": 0.0, "completion_length": 554.0469131469727, "epoch": 0.42894481368083043, "grad_norm": 0.2794991433620453, "kl": 0.51220703125, "learning_rate": 1.4099737726469823e-05, "loss": 0.0595, "reward": 2.0239956080913544, "reward_std": 0.1285719582810998, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.9882812649011612, "step": 1436 }, { "clip_ratio": 0.0, "completion_length": 542.1317291259766, "epoch": 0.4292435217683519, "grad_norm": 0.16890640556812286, "kl": 0.13720703125, "learning_rate": 1.409022209105557e-05, "loss": 0.0465, "reward": 2.1205358505249023, "reward_std": 0.16072074510157108, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9910714626312256, "step": 1437 }, { "clip_ratio": 0.0, "completion_length": 540.8303680419922, "epoch": 0.42954222985587337, "grad_norm": 0.39541077613830566, "kl": 0.22216796875, "learning_rate": 1.408070200588057e-05, "loss": 0.0796, "reward": 2.0697546005249023, "reward_std": 0.2648947462439537, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.97823666036129, "step": 1438 }, { "clip_ratio": 0.0, "completion_length": 523.5580596923828, "epoch": 0.42984093794339484, "grad_norm": 0.3909253776073456, "kl": 0.196044921875, "learning_rate": 1.407117748130174e-05, "loss": 0.0497, "reward": 2.1233259737491608, "reward_std": 0.16185403242707253, "rewards/accuracy_reward": 0.16071429336443543, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9827009290456772, "step": 1439 }, { "clip_ratio": 0.0, "completion_length": 483.8482360839844, "epoch": 0.4301396460309163, "grad_norm": 1.067259669303894, "kl": 0.59130859375, "learning_rate": 1.4061648527680825e-05, "loss": 0.097, "reward": 2.0563617646694183, "reward_std": 0.20521672442555428, "rewards/accuracy_reward": 0.08928572130389512, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9871652275323868, "step": 1440 }, { "clip_ratio": 0.0, "completion_length": 490.71653747558594, "epoch": 0.4304383541184378, "grad_norm": 0.2410777360200882, "kl": 0.1134033203125, "learning_rate": 1.4052115155384401e-05, "loss": 0.0125, "reward": 2.064174175262451, "reward_std": 0.13408542051911354, "rewards/accuracy_reward": 0.07589286169968545, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949777126312256, "step": 1441 }, { "clip_ratio": 0.0, "completion_length": 479.5714569091797, "epoch": 0.43073706220595925, "grad_norm": 0.6482803821563721, "kl": 0.2425537109375, "learning_rate": 1.4042577374783834e-05, "loss": 0.08, "reward": 2.106026917695999, "reward_std": 0.2010877039283514, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9832589775323868, "step": 1442 }, { "clip_ratio": 0.0, "completion_length": 518.7009201049805, "epoch": 0.43103577029348067, "grad_norm": 0.13446611166000366, "kl": 0.1070556640625, "learning_rate": 1.40330351962553e-05, "loss": 0.0223, "reward": 2.0502232909202576, "reward_std": 0.14148344472050667, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875447034836, "step": 1443 }, { "clip_ratio": 0.0, "completion_length": 492.01564025878906, "epoch": 0.43133447838100214, "grad_norm": 0.5748964548110962, "kl": 0.129150390625, "learning_rate": 1.402348863017975e-05, "loss": 0.0336, "reward": 2.0256697237491608, "reward_std": 0.09484535455703735, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9944196790456772, "step": 1444 }, { "clip_ratio": 0.0, "completion_length": 495.1808319091797, "epoch": 0.4316331864685236, "grad_norm": 0.38791292905807495, "kl": 0.115234375, "learning_rate": 1.401393768694292e-05, "loss": 0.0517, "reward": 2.1015626192092896, "reward_std": 0.14427222311496735, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 1445 }, { "clip_ratio": 0.0, "completion_length": 459.35716247558594, "epoch": 0.4319318945560451, "grad_norm": 0.2548392713069916, "kl": 0.16015625, "learning_rate": 1.4004382376935293e-05, "loss": 0.0542, "reward": 2.098772406578064, "reward_std": 0.14883743040263653, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973618745804, "step": 1446 }, { "clip_ratio": 0.0, "completion_length": 494.68306732177734, "epoch": 0.43223060264356655, "grad_norm": 0.6167176365852356, "kl": 0.236328125, "learning_rate": 1.3994822710552108e-05, "loss": 0.0415, "reward": 2.1529018878936768, "reward_std": 0.20908554643392563, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9899553954601288, "step": 1447 }, { "clip_ratio": 0.0, "completion_length": 478.8593978881836, "epoch": 0.432529310731088, "grad_norm": 0.2103998064994812, "kl": 0.1688232421875, "learning_rate": 1.3985258698193351e-05, "loss": 0.0568, "reward": 2.1160714626312256, "reward_std": 0.21377640217542648, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9910714626312256, "step": 1448 }, { "clip_ratio": 0.0, "completion_length": 480.02904510498047, "epoch": 0.4328280188186095, "grad_norm": 0.22564615309238434, "kl": 0.3822021484375, "learning_rate": 1.397569035026373e-05, "loss": 0.0179, "reward": 2.1484376192092896, "reward_std": 0.10188870131969452, "rewards/accuracy_reward": 0.16071429383009672, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9966517984867096, "step": 1449 }, { "clip_ratio": 0.0, "completion_length": 519.2991256713867, "epoch": 0.43312672690613097, "grad_norm": 0.16577734053134918, "kl": 0.116943359375, "learning_rate": 1.3966117677172663e-05, "loss": 0.0597, "reward": 2.1244420409202576, "reward_std": 0.12311662174761295, "rewards/accuracy_reward": 0.14508929220028222, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9927455931901932, "step": 1450 }, { "clip_ratio": 0.0, "completion_length": 527.482177734375, "epoch": 0.43342543499365244, "grad_norm": 0.3378883898258209, "kl": 0.149658203125, "learning_rate": 1.3956540689334286e-05, "loss": 0.0515, "reward": 2.0770089626312256, "reward_std": 0.1608013864606619, "rewards/accuracy_reward": 0.10491071990691125, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.987723246216774, "step": 1451 }, { "clip_ratio": 0.0, "completion_length": 505.45091247558594, "epoch": 0.4337241430811739, "grad_norm": 0.691547691822052, "kl": 0.46826171875, "learning_rate": 1.3946959397167423e-05, "loss": 0.0408, "reward": 2.1902902722358704, "reward_std": 0.18261720426380634, "rewards/accuracy_reward": 0.21205358440056443, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.991629496216774, "step": 1452 }, { "clip_ratio": 0.0, "completion_length": 577.9040451049805, "epoch": 0.4340228511686954, "grad_norm": 0.480336993932724, "kl": 0.20556640625, "learning_rate": 1.393737381109558e-05, "loss": 0.0511, "reward": 1.9827009439468384, "reward_std": 0.1270153559744358, "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652275323868, "step": 1453 }, { "clip_ratio": 0.0, "completion_length": 525.0469055175781, "epoch": 0.43432155925621685, "grad_norm": 0.24502453207969666, "kl": 0.2041015625, "learning_rate": 1.392778394154693e-05, "loss": 0.0549, "reward": 1.9972098767757416, "reward_std": 0.11453312262892723, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 1454 }, { "clip_ratio": 0.0, "completion_length": 536.0111846923828, "epoch": 0.4346202673437383, "grad_norm": 0.43174219131469727, "kl": 0.228759765625, "learning_rate": 1.3918189798954322e-05, "loss": 0.073, "reward": 2.016741156578064, "reward_std": 0.19673027098178864, "rewards/accuracy_reward": 0.05133928754366934, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9854911118745804, "step": 1455 }, { "clip_ratio": 0.0, "completion_length": 531.4620742797852, "epoch": 0.4349189754312598, "grad_norm": 0.21784824132919312, "kl": 0.220947265625, "learning_rate": 1.3908591393755234e-05, "loss": 0.0443, "reward": 2.0345982909202576, "reward_std": 0.16091222688555717, "rewards/accuracy_reward": 0.06026785867288709, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.987723246216774, "step": 1456 }, { "clip_ratio": 0.0, "completion_length": 573.7678985595703, "epoch": 0.43521768351878126, "grad_norm": 0.3279295861721039, "kl": 0.2020263671875, "learning_rate": 1.3898988736391792e-05, "loss": 0.0559, "reward": 2.055245667695999, "reward_std": 0.15163080766797066, "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9860491454601288, "step": 1457 }, { "clip_ratio": 0.0, "completion_length": 561.2790451049805, "epoch": 0.43551639160630273, "grad_norm": 3.9656293392181396, "kl": 0.675048828125, "learning_rate": 1.3889381837310746e-05, "loss": 0.1143, "reward": 2.1127232909202576, "reward_std": 0.21801859885454178, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.981026828289032, "step": 1458 }, { "clip_ratio": 0.0, "completion_length": 564.9955749511719, "epoch": 0.4358150996938242, "grad_norm": 0.27604350447654724, "kl": 0.1688232421875, "learning_rate": 1.3879770706963464e-05, "loss": 0.0568, "reward": 2.118861675262451, "reward_std": 0.22719258815050125, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.984933078289032, "step": 1459 }, { "clip_ratio": 0.0, "completion_length": 577.0647506713867, "epoch": 0.4361138077813457, "grad_norm": 0.2534720301628113, "kl": 0.1583251953125, "learning_rate": 1.387015535580591e-05, "loss": 0.0655, "reward": 2.0273438692092896, "reward_std": 0.19615241140127182, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9871652275323868, "step": 1460 }, { "clip_ratio": 0.0, "completion_length": 609.4844055175781, "epoch": 0.43641251586886715, "grad_norm": 0.2220684289932251, "kl": 0.1580810546875, "learning_rate": 1.3860535794298644e-05, "loss": 0.0412, "reward": 2.0407367050647736, "reward_std": 0.16986338887363672, "rewards/accuracy_reward": 0.07366071781143546, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9871652126312256, "step": 1461 }, { "clip_ratio": 0.0, "completion_length": 633.6004638671875, "epoch": 0.4367112239563886, "grad_norm": 0.47611597180366516, "kl": 0.239990234375, "learning_rate": 1.385091203290681e-05, "loss": 0.091, "reward": 1.9743304550647736, "reward_std": 0.3459855765104294, "rewards/accuracy_reward": 0.05580357415601611, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9676339775323868, "step": 1462 }, { "clip_ratio": 0.0, "completion_length": 596.2589569091797, "epoch": 0.4370099320439101, "grad_norm": 0.1229521781206131, "kl": 0.1046142578125, "learning_rate": 1.384128408210011e-05, "loss": 0.0216, "reward": 2.0435268878936768, "reward_std": 0.13865990191698074, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.994419664144516, "step": 1463 }, { "clip_ratio": 0.0, "completion_length": 646.8839416503906, "epoch": 0.43730864013143156, "grad_norm": 0.20393481850624084, "kl": 0.1517333984375, "learning_rate": 1.3831651952352818e-05, "loss": 0.0638, "reward": 1.9983259737491608, "reward_std": 0.25272779539227486, "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9782366454601288, "step": 1464 }, { "clip_ratio": 0.0, "completion_length": 580.0245666503906, "epoch": 0.43760734821895303, "grad_norm": 0.433793842792511, "kl": 0.1165771484375, "learning_rate": 1.3822015654143742e-05, "loss": 0.0438, "reward": 2.090959906578064, "reward_std": 0.1442361194640398, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.992745578289032, "step": 1465 }, { "clip_ratio": 0.0, "completion_length": 624.2433319091797, "epoch": 0.4379060563064745, "grad_norm": 0.24745741486549377, "kl": 0.1712646484375, "learning_rate": 1.3812375197956233e-05, "loss": 0.0903, "reward": 2.0279018878936768, "reward_std": 0.29283084720373154, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.9575893133878708, "rewards/tag_count_reward": 0.9787946790456772, "step": 1466 }, { "clip_ratio": 0.0, "completion_length": 594.7656478881836, "epoch": 0.43820476439399597, "grad_norm": 0.4022057354450226, "kl": 0.3023681640625, "learning_rate": 1.3802730594278161e-05, "loss": 0.0723, "reward": 2.0245536863803864, "reward_std": 0.25557738170027733, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9754464775323868, "step": 1467 }, { "clip_ratio": 0.0, "completion_length": 583.9643096923828, "epoch": 0.43850347248151744, "grad_norm": 0.9539609551429749, "kl": 0.296630859375, "learning_rate": 1.3793081853601913e-05, "loss": 0.0764, "reward": 2.0502233505249023, "reward_std": 0.2625872381031513, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.981026828289032, "step": 1468 }, { "clip_ratio": 0.0, "completion_length": 639.1652069091797, "epoch": 0.4388021805690389, "grad_norm": 0.18257439136505127, "kl": 0.1258544921875, "learning_rate": 1.3783428986424366e-05, "loss": 0.0314, "reward": 2.0357143878936768, "reward_std": 0.17381156235933304, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.988839328289032, "step": 1469 }, { "clip_ratio": 0.0, "completion_length": 530.7879638671875, "epoch": 0.4391008886565604, "grad_norm": 1.8657253980636597, "kl": 0.637451171875, "learning_rate": 1.37737720032469e-05, "loss": 0.0642, "reward": 2.041852742433548, "reward_std": 0.18754428811371326, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9860491454601288, "step": 1470 }, { "clip_ratio": 0.0, "completion_length": 565.2098541259766, "epoch": 0.43939959674408186, "grad_norm": 0.481891393661499, "kl": 0.2435302734375, "learning_rate": 1.3764110914575365e-05, "loss": 0.0728, "reward": 2.092076003551483, "reward_std": 0.1914113610982895, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9849330633878708, "step": 1471 }, { "clip_ratio": 0.0, "completion_length": 500.1696548461914, "epoch": 0.4396983048316033, "grad_norm": 1.0823092460632324, "kl": 0.18896484375, "learning_rate": 1.3754445730920075e-05, "loss": 0.0998, "reward": 2.0892857909202576, "reward_std": 0.2640192434191704, "rewards/accuracy_reward": 0.13169643562287092, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9821428954601288, "step": 1472 }, { "clip_ratio": 0.0, "completion_length": 563.2344055175781, "epoch": 0.4399970129191248, "grad_norm": 0.5220093131065369, "kl": 0.2376708984375, "learning_rate": 1.3744776462795806e-05, "loss": 0.0842, "reward": 2.0485492050647736, "reward_std": 0.2210545800626278, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.9620536267757416, "rewards/tag_count_reward": 0.977120578289032, "step": 1473 }, { "clip_ratio": 0.0, "completion_length": 541.8214416503906, "epoch": 0.44029572100664627, "grad_norm": 0.2057429999113083, "kl": 0.1239013671875, "learning_rate": 1.3735103120721773e-05, "loss": 0.0626, "reward": 2.0184153020381927, "reward_std": 0.1278936229646206, "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9916295111179352, "step": 1474 }, { "clip_ratio": 0.0, "completion_length": 535.4620819091797, "epoch": 0.44059442909416774, "grad_norm": 0.2237231284379959, "kl": 0.202392578125, "learning_rate": 1.3725425715221625e-05, "loss": 0.0695, "reward": 2.1244420409202576, "reward_std": 0.1912413164973259, "rewards/accuracy_reward": 0.16294643771834671, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9860491454601288, "step": 1475 }, { "clip_ratio": 0.0, "completion_length": 519.4977874755859, "epoch": 0.4408931371816892, "grad_norm": 0.3001415431499481, "kl": 0.1474609375, "learning_rate": 1.3715744256823427e-05, "loss": 0.1134, "reward": 2.0368304550647736, "reward_std": 0.3450069725513458, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.972098246216774, "step": 1476 }, { "clip_ratio": 0.0, "completion_length": 544.6406555175781, "epoch": 0.4411918452692107, "grad_norm": 0.23846453428268433, "kl": 0.1436767578125, "learning_rate": 1.3706058756059661e-05, "loss": 0.0593, "reward": 2.0234375596046448, "reward_std": 0.17612027935683727, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.981026828289032, "step": 1477 }, { "clip_ratio": 0.0, "completion_length": 512.2589416503906, "epoch": 0.44149055335673215, "grad_norm": 0.23498955368995667, "kl": 0.1817626953125, "learning_rate": 1.3696369223467204e-05, "loss": 0.0486, "reward": 2.086495667695999, "reward_std": 0.17647318728268147, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9905134290456772, "step": 1478 }, { "clip_ratio": 0.0, "completion_length": 547.5580673217773, "epoch": 0.4417892614442536, "grad_norm": 0.20959649980068207, "kl": 0.1627197265625, "learning_rate": 1.3686675669587311e-05, "loss": 0.0685, "reward": 2.0334822237491608, "reward_std": 0.2177325040102005, "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9843750447034836, "step": 1479 }, { "clip_ratio": 0.0, "completion_length": 541.3147583007812, "epoch": 0.4420879695317751, "grad_norm": 0.25056734681129456, "kl": 0.17236328125, "learning_rate": 1.3676978104965623e-05, "loss": 0.0755, "reward": 2.146205425262451, "reward_std": 0.2521969545632601, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9787946939468384, "step": 1480 }, { "clip_ratio": 0.0, "completion_length": 522.732177734375, "epoch": 0.44238667761929656, "grad_norm": 0.21479375660419464, "kl": 0.16162109375, "learning_rate": 1.3667276540152143e-05, "loss": 0.054, "reward": 2.071986675262451, "reward_std": 0.23464728891849518, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9665178805589676, "rewards/tag_count_reward": 0.9827009290456772, "step": 1481 }, { "clip_ratio": 0.0, "completion_length": 532.6205596923828, "epoch": 0.44268538570681804, "grad_norm": 0.21422693133354187, "kl": 0.16162109375, "learning_rate": 1.3657570985701217e-05, "loss": 0.0718, "reward": 2.0669643878936768, "reward_std": 0.1640415582805872, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9821428954601288, "step": 1482 }, { "clip_ratio": 0.0, "completion_length": 594.3906402587891, "epoch": 0.4429840937943395, "grad_norm": 0.23466746509075165, "kl": 0.111572265625, "learning_rate": 1.3647861452171536e-05, "loss": 0.0414, "reward": 1.98381707072258, "reward_std": 0.24797337502241135, "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.964285746216774, "rewards/tag_count_reward": 0.9793527126312256, "step": 1483 }, { "clip_ratio": 0.0, "completion_length": 602.4531402587891, "epoch": 0.443282801881861, "grad_norm": 0.2863079011440277, "kl": 0.1656494140625, "learning_rate": 1.3638147950126128e-05, "loss": 0.113, "reward": 1.9358259737491608, "reward_std": 0.39045654982328415, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.9084821790456772, "rewards/tag_count_reward": 0.9380580633878708, "step": 1484 }, { "clip_ratio": 0.0, "completion_length": 551.4442138671875, "epoch": 0.44358150996938245, "grad_norm": 0.33454421162605286, "kl": 0.1826171875, "learning_rate": 1.3628430490132327e-05, "loss": 0.0971, "reward": 1.971540242433548, "reward_std": 0.31716661900281906, "rewards/accuracy_reward": 0.0714285762514919, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9581473618745804, "step": 1485 }, { "clip_ratio": 0.0, "completion_length": 590.3928985595703, "epoch": 0.44388021805690386, "grad_norm": 0.3827565908432007, "kl": 0.1572265625, "learning_rate": 1.3618709082761773e-05, "loss": 0.0582, "reward": 2.0691965222358704, "reward_std": 0.29725388437509537, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.9620535969734192, "rewards/tag_count_reward": 0.973214328289032, "step": 1486 }, { "clip_ratio": 0.0, "completion_length": 596.0647430419922, "epoch": 0.44417892614442533, "grad_norm": 0.6033827662467957, "kl": 0.273193359375, "learning_rate": 1.3608983738590414e-05, "loss": 0.1484, "reward": 1.9609375894069672, "reward_std": 0.41197142004966736, "rewards/accuracy_reward": 0.13169643376022577, "rewards/format_reward": 0.901785746216774, "rewards/tag_count_reward": 0.92745541036129, "step": 1487 }, { "clip_ratio": 0.0, "completion_length": 560.3794860839844, "epoch": 0.4444776342319468, "grad_norm": 0.8678237795829773, "kl": 0.361328125, "learning_rate": 1.3599254468198462e-05, "loss": 0.1972, "reward": 1.9402902722358704, "reward_std": 0.4075882248580456, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.8906250447034836, "rewards/tag_count_reward": 0.9157366305589676, "step": 1488 }, { "clip_ratio": 0.0, "completion_length": 585.1942138671875, "epoch": 0.4447763423194683, "grad_norm": 0.21150164306163788, "kl": 0.1314697265625, "learning_rate": 1.3589521282170415e-05, "loss": 0.06, "reward": 2.005580395460129, "reward_std": 0.19764053262770176, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9787946790456772, "step": 1489 }, { "clip_ratio": 0.0, "completion_length": 564.9219055175781, "epoch": 0.44507505040698975, "grad_norm": 0.24454402923583984, "kl": 0.1175537109375, "learning_rate": 1.3579784191095022e-05, "loss": 0.0636, "reward": 2.0904018580913544, "reward_std": 0.23054054379463196, "rewards/accuracy_reward": 0.14508929592557251, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9765625447034836, "step": 1490 }, { "clip_ratio": 0.0, "completion_length": 518.4464569091797, "epoch": 0.4453737584945112, "grad_norm": 0.14021563529968262, "kl": 0.104736328125, "learning_rate": 1.3570043205565289e-05, "loss": 0.028, "reward": 2.0781251192092896, "reward_std": 0.07450867909938097, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9955357313156128, "step": 1491 }, { "clip_ratio": 0.0, "completion_length": 536.6473541259766, "epoch": 0.4456724665820327, "grad_norm": 0.19484686851501465, "kl": 0.17236328125, "learning_rate": 1.356029833617845e-05, "loss": 0.0556, "reward": 1.9709822237491608, "reward_std": 0.14612732827663422, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9866071790456772, "step": 1492 }, { "clip_ratio": 0.0, "completion_length": 520.4843902587891, "epoch": 0.44597117466955416, "grad_norm": 0.21523040533065796, "kl": 0.1248779296875, "learning_rate": 1.3550549593535965e-05, "loss": 0.0542, "reward": 2.0669643580913544, "reward_std": 0.22353793680667877, "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.98214291036129, "step": 1493 }, { "clip_ratio": 0.0, "completion_length": 522.819221496582, "epoch": 0.44626988275707563, "grad_norm": 0.10839337855577469, "kl": 0.17724609375, "learning_rate": 1.3540796988243514e-05, "loss": 0.0061, "reward": 2.122209846973419, "reward_std": 0.08893957175314426, "rewards/accuracy_reward": 0.1294642873108387, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 1494 }, { "clip_ratio": 0.0, "completion_length": 514.6919860839844, "epoch": 0.4465685908445971, "grad_norm": 0.10666202008724213, "kl": 0.160888671875, "learning_rate": 1.3531040530910977e-05, "loss": 0.0205, "reward": 2.060267925262451, "reward_std": 0.13341925479471684, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714626312256, "step": 1495 }, { "clip_ratio": 0.0, "completion_length": 469.6294860839844, "epoch": 0.4468672989321186, "grad_norm": 0.17858126759529114, "kl": 0.1336669921875, "learning_rate": 1.3521280232152421e-05, "loss": 0.1259, "reward": 2.046317011117935, "reward_std": 0.1752837635576725, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9793527126312256, "step": 1496 }, { "clip_ratio": 0.0, "completion_length": 500.3415298461914, "epoch": 0.44716600701964004, "grad_norm": 0.1283605396747589, "kl": 0.109619140625, "learning_rate": 1.3511516102586093e-05, "loss": 0.0311, "reward": 2.0323661863803864, "reward_std": 0.11728889308869839, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9921875149011612, "step": 1497 }, { "clip_ratio": 0.0, "completion_length": 499.6830596923828, "epoch": 0.4474647151071615, "grad_norm": 0.16133500635623932, "kl": 0.1256103515625, "learning_rate": 1.3501748152834413e-05, "loss": 0.0789, "reward": 2.0027903020381927, "reward_std": 0.1495535783469677, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9871652126312256, "step": 1498 }, { "clip_ratio": 0.0, "completion_length": 494.0044860839844, "epoch": 0.447763423194683, "grad_norm": 0.1305387020111084, "kl": 0.113037109375, "learning_rate": 1.3491976393523952e-05, "loss": 0.0507, "reward": 2.1026787161827087, "reward_std": 0.15971102565526962, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9910714477300644, "step": 1499 }, { "clip_ratio": 0.0, "completion_length": 453.49109649658203, "epoch": 0.44806213128220446, "grad_norm": 0.19070114195346832, "kl": 0.202880859375, "learning_rate": 1.3482200835285421e-05, "loss": 0.0287, "reward": 2.029576063156128, "reward_std": 0.13169643748551607, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.9916294813156128, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 486.1406555175781, "epoch": 0.4483608393697259, "grad_norm": 0.2513846457004547, "kl": 0.63037109375, "learning_rate": 1.3472421488753678e-05, "loss": 0.0165, "reward": 2.1395090222358704, "reward_std": 0.17927245050668716, "rewards/accuracy_reward": 0.17187501210719347, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.987723246216774, "step": 1501 }, { "clip_ratio": 0.0, "completion_length": 492.1451110839844, "epoch": 0.4486595474572474, "grad_norm": 0.21899664402008057, "kl": 0.349365234375, "learning_rate": 1.3462638364567688e-05, "loss": 0.0218, "reward": 1.9927456378936768, "reward_std": 0.13050232641398907, "rewards/accuracy_reward": 0.013392857974395156, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 1502 }, { "clip_ratio": 0.0, "completion_length": 499.2656555175781, "epoch": 0.44895825554476887, "grad_norm": 0.12096086889505386, "kl": 0.1129150390625, "learning_rate": 1.3452851473370531e-05, "loss": 0.0217, "reward": 2.071428656578064, "reward_std": 0.13722501136362553, "rewards/accuracy_reward": 0.08482143469154835, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1503 }, { "clip_ratio": 0.0, "completion_length": 493.95091247558594, "epoch": 0.44925696363229034, "grad_norm": 0.19943055510520935, "kl": 0.13623046875, "learning_rate": 1.3443060825809387e-05, "loss": 0.0308, "reward": 2.0742188692092896, "reward_std": 0.18509754538536072, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.991629496216774, "step": 1504 }, { "clip_ratio": 0.0, "completion_length": 521.091552734375, "epoch": 0.4495556717198118, "grad_norm": 0.09477316588163376, "kl": 0.0882568359375, "learning_rate": 1.343326643253552e-05, "loss": -0.0041, "reward": 2.1339287161827087, "reward_std": 0.0745241567492485, "rewards/accuracy_reward": 0.13392857392318547, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1505 }, { "clip_ratio": 0.0, "completion_length": 568.6808166503906, "epoch": 0.4498543798073333, "grad_norm": 0.1138644590973854, "kl": 0.1114501953125, "learning_rate": 1.3423468304204275e-05, "loss": 0.0525, "reward": 2.05022332072258, "reward_std": 0.11970869824290276, "rewards/accuracy_reward": 0.07142857811413705, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9899553805589676, "step": 1506 }, { "clip_ratio": 0.0, "completion_length": 539.7991485595703, "epoch": 0.45015308789485475, "grad_norm": 0.1722840517759323, "kl": 0.28662109375, "learning_rate": 1.3413666451475048e-05, "loss": 0.038, "reward": 2.046875149011612, "reward_std": 0.22875139489769936, "rewards/accuracy_reward": 0.09151786402799189, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9821428954601288, "step": 1507 }, { "clip_ratio": 0.0, "completion_length": 566.0111770629883, "epoch": 0.4504517959823762, "grad_norm": 0.15097777545452118, "kl": 0.1278076171875, "learning_rate": 1.3403860885011297e-05, "loss": 0.0434, "reward": 2.0898438692092896, "reward_std": 0.15839090198278427, "rewards/accuracy_reward": 0.11383929313160479, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.991629496216774, "step": 1508 }, { "clip_ratio": 0.0, "completion_length": 573.3459930419922, "epoch": 0.4507505040698977, "grad_norm": 0.14187893271446228, "kl": 0.1767578125, "learning_rate": 1.3394051615480516e-05, "loss": 0.0072, "reward": 2.090401828289032, "reward_std": 0.18241967260837555, "rewards/accuracy_reward": 0.10937500419095159, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875447034836, "step": 1509 }, { "clip_ratio": 0.0, "completion_length": 560.2053833007812, "epoch": 0.45104921215741917, "grad_norm": 0.12329911440610886, "kl": 0.10498046875, "learning_rate": 1.3384238653554234e-05, "loss": 0.0673, "reward": 2.161830425262451, "reward_std": 0.1625830102711916, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553954601288, "step": 1510 }, { "clip_ratio": 0.0, "completion_length": 539.7321701049805, "epoch": 0.45134792024494064, "grad_norm": 0.2300575226545334, "kl": 0.1202392578125, "learning_rate": 1.3374422009907984e-05, "loss": 0.0942, "reward": 2.032924234867096, "reward_std": 0.16857812274247408, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9882812798023224, "step": 1511 }, { "clip_ratio": 0.0, "completion_length": 564.1852874755859, "epoch": 0.4516466283324621, "grad_norm": 1.0793145895004272, "kl": 1.04638671875, "learning_rate": 1.3364601695221318e-05, "loss": 0.0333, "reward": 2.0764509737491608, "reward_std": 0.2041449025273323, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9893973618745804, "step": 1512 }, { "clip_ratio": 0.0, "completion_length": 515.9553756713867, "epoch": 0.4519453364199836, "grad_norm": 0.2248072475194931, "kl": 0.2481689453125, "learning_rate": 1.3354777720177775e-05, "loss": 0.0825, "reward": 2.1590402722358704, "reward_std": 0.24432019144296646, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9804687947034836, "step": 1513 }, { "clip_ratio": 0.0, "completion_length": 554.2545013427734, "epoch": 0.45224404450750505, "grad_norm": 0.23455113172531128, "kl": 0.2806396484375, "learning_rate": 1.3344950095464872e-05, "loss": 0.057, "reward": 2.118861675262451, "reward_std": 0.17651254124939442, "rewards/accuracy_reward": 0.16294643888249993, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9827009290456772, "step": 1514 }, { "clip_ratio": 0.0, "completion_length": 567.6361846923828, "epoch": 0.4525427525950265, "grad_norm": 0.1431729793548584, "kl": 0.3095703125, "learning_rate": 1.333511883177411e-05, "loss": 0.0467, "reward": 2.0753349363803864, "reward_std": 0.22991076856851578, "rewards/accuracy_reward": 0.11607143096625805, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.983816996216774, "step": 1515 }, { "clip_ratio": 0.0, "completion_length": 502.4955596923828, "epoch": 0.452841460682548, "grad_norm": 0.14935550093650818, "kl": 0.1043701171875, "learning_rate": 1.3325283939800935e-05, "loss": 0.0547, "reward": 2.1116071939468384, "reward_std": 0.18376337550580502, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714626312256, "step": 1516 }, { "clip_ratio": 0.0, "completion_length": 570.9509124755859, "epoch": 0.45314016877006946, "grad_norm": 0.2388852834701538, "kl": 0.1363525390625, "learning_rate": 1.3315445430244744e-05, "loss": 0.0766, "reward": 2.03850457072258, "reward_std": 0.2009551953524351, "rewards/accuracy_reward": 0.08928571571595967, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9782366454601288, "step": 1517 }, { "clip_ratio": 0.0, "completion_length": 586.7879791259766, "epoch": 0.45343887685759093, "grad_norm": 0.16366739571094513, "kl": 0.338134765625, "learning_rate": 1.3305603313808875e-05, "loss": 0.0449, "reward": 2.0887277722358704, "reward_std": 0.1886889822781086, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.983816996216774, "step": 1518 }, { "clip_ratio": 0.0, "completion_length": 592.9620819091797, "epoch": 0.4537375849451124, "grad_norm": 0.41503384709358215, "kl": 0.134033203125, "learning_rate": 1.3295757601200582e-05, "loss": 0.0827, "reward": 1.95256707072258, "reward_std": 0.2329481765627861, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9704241454601288, "step": 1519 }, { "clip_ratio": 0.0, "completion_length": 548.7433242797852, "epoch": 0.4540362930326339, "grad_norm": 0.40483513474464417, "kl": 0.552978515625, "learning_rate": 1.3285908303131043e-05, "loss": 0.0664, "reward": 2.0167411863803864, "reward_std": 0.2016086708754301, "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.981026828289032, "step": 1520 }, { "clip_ratio": 0.0, "completion_length": 546.0335006713867, "epoch": 0.45433500112015535, "grad_norm": 0.21131671965122223, "kl": 0.154541015625, "learning_rate": 1.327605543031532e-05, "loss": 0.0568, "reward": 2.0312501192092896, "reward_std": 0.17015941068530083, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9843750447034836, "step": 1521 }, { "clip_ratio": 0.0, "completion_length": 536.6964569091797, "epoch": 0.4546337092076768, "grad_norm": 0.33576905727386475, "kl": 0.138916015625, "learning_rate": 1.3266198993472377e-05, "loss": 0.0709, "reward": 2.0256697237491608, "reward_std": 0.23319803923368454, "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.9765625298023224, "step": 1522 }, { "clip_ratio": 0.0, "completion_length": 538.5446624755859, "epoch": 0.4549324172951983, "grad_norm": 0.403288871049881, "kl": 0.20263671875, "learning_rate": 1.3256339003325054e-05, "loss": 0.1242, "reward": 1.9380581676959991, "reward_std": 0.28497718274593353, "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.9531250298023224, "rewards/tag_count_reward": 0.9648437798023224, "step": 1523 }, { "clip_ratio": 0.0, "completion_length": 557.6585006713867, "epoch": 0.45523112538271976, "grad_norm": 0.6041985750198364, "kl": 0.52685546875, "learning_rate": 1.324647547060005e-05, "loss": 0.1334, "reward": 1.9241072833538055, "reward_std": 0.3821380138397217, "rewards/accuracy_reward": 0.08258929080329835, "rewards/format_reward": 0.9084821939468384, "rewards/tag_count_reward": 0.9330357611179352, "step": 1524 }, { "clip_ratio": 0.0, "completion_length": 495.3928756713867, "epoch": 0.45552983347024123, "grad_norm": 1.1155009269714355, "kl": 0.5595703125, "learning_rate": 1.3236608406027918e-05, "loss": 0.2038, "reward": 1.9453126192092896, "reward_std": 0.4160626158118248, "rewards/accuracy_reward": 0.08705357415601611, "rewards/format_reward": 0.9196428954601288, "rewards/tag_count_reward": 0.9386161267757416, "step": 1525 }, { "clip_ratio": 0.0, "completion_length": 559.2321624755859, "epoch": 0.4558285415577627, "grad_norm": 1.1415663957595825, "kl": 0.5009765625, "learning_rate": 1.3226737820343066e-05, "loss": 0.276, "reward": 1.829241156578064, "reward_std": 0.5926388651132584, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.8370536118745804, "rewards/tag_count_reward": 0.8761161118745804, "step": 1526 }, { "clip_ratio": 0.0, "completion_length": 505.5357437133789, "epoch": 0.45612724964528417, "grad_norm": 0.6479904055595398, "kl": 0.450439453125, "learning_rate": 1.321686372428372e-05, "loss": 0.1271, "reward": 2.035714417695999, "reward_std": 0.30360669270157814, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.9352678954601288, "rewards/tag_count_reward": 0.9531250298023224, "step": 1527 }, { "clip_ratio": 0.0, "completion_length": 461.18528747558594, "epoch": 0.45642595773280564, "grad_norm": 0.3057272732257843, "kl": 0.252197265625, "learning_rate": 1.3206986128591925e-05, "loss": 0.2266, "reward": 1.9536831378936768, "reward_std": 0.3593439757823944, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.9330357611179352, "rewards/tag_count_reward": 0.9469866305589676, "step": 1528 }, { "clip_ratio": 0.0, "completion_length": 460.3035888671875, "epoch": 0.45672466582032706, "grad_norm": 0.26745671033859253, "kl": 0.166748046875, "learning_rate": 1.3197105044013544e-05, "loss": 0.1605, "reward": 1.9804688096046448, "reward_std": 0.3202693909406662, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.96261166036129, "step": 1529 }, { "clip_ratio": 0.0, "completion_length": 435.11163330078125, "epoch": 0.45702337390784853, "grad_norm": 0.26042962074279785, "kl": 0.1611328125, "learning_rate": 1.3187220481298227e-05, "loss": 0.1754, "reward": 1.9709822535514832, "reward_std": 0.29194703325629234, "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.9464286118745804, "rewards/tag_count_reward": 0.9598214626312256, "step": 1530 }, { "clip_ratio": 0.0, "completion_length": 450.4018096923828, "epoch": 0.45732208199537, "grad_norm": 0.4774284362792969, "kl": 0.237060546875, "learning_rate": 1.3177332451199405e-05, "loss": 0.2376, "reward": 1.9464286863803864, "reward_std": 0.43635885417461395, "rewards/accuracy_reward": 0.09375000116415322, "rewards/format_reward": 0.917410746216774, "rewards/tag_count_reward": 0.9352678954601288, "step": 1531 }, { "clip_ratio": 0.0, "completion_length": 389.88394927978516, "epoch": 0.45762079008289147, "grad_norm": 0.5789512991905212, "kl": 0.19580078125, "learning_rate": 1.3167440964474285e-05, "loss": 0.2126, "reward": 2.048549175262451, "reward_std": 0.33218172937631607, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9547991454601288, "step": 1532 }, { "clip_ratio": 0.0, "completion_length": 386.12278747558594, "epoch": 0.45791949817041294, "grad_norm": 0.4959510862827301, "kl": 0.15869140625, "learning_rate": 1.3157546031883843e-05, "loss": 0.1469, "reward": 1.9687500596046448, "reward_std": 0.2634093314409256, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.9441964626312256, "rewards/tag_count_reward": 0.9598214626312256, "step": 1533 }, { "clip_ratio": 0.0, "completion_length": 413.8750228881836, "epoch": 0.4582182062579344, "grad_norm": 0.3473011553287506, "kl": 0.250244140625, "learning_rate": 1.314764766419279e-05, "loss": 0.223, "reward": 1.9112724363803864, "reward_std": 0.2980264127254486, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.9441964626312256, "rewards/tag_count_reward": 0.9581473618745804, "step": 1534 }, { "clip_ratio": 0.0, "completion_length": 349.83260345458984, "epoch": 0.4585169143454559, "grad_norm": 0.6778643727302551, "kl": 0.4306640625, "learning_rate": 1.3137745872169578e-05, "loss": 0.2245, "reward": 2.09709832072258, "reward_std": 0.23972053453326225, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9743303954601288, "step": 1535 }, { "clip_ratio": 0.0, "completion_length": 339.2901840209961, "epoch": 0.45881562243297735, "grad_norm": 6.628921031951904, "kl": 0.37158203125, "learning_rate": 1.312784066658639e-05, "loss": 0.1693, "reward": 2.080357253551483, "reward_std": 0.21079185605049133, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9754464775323868, "step": 1536 }, { "clip_ratio": 0.0, "completion_length": 297.8571548461914, "epoch": 0.4591143305204988, "grad_norm": 2.2159273624420166, "kl": 0.5751953125, "learning_rate": 1.3117932058219123e-05, "loss": 0.2219, "reward": 2.122767925262451, "reward_std": 0.2661058381199837, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9754464626312256, "step": 1537 }, { "clip_ratio": 0.0, "completion_length": 287.0982360839844, "epoch": 0.4594130386080203, "grad_norm": 0.6338444948196411, "kl": 0.2314453125, "learning_rate": 1.3108020057847363e-05, "loss": 0.1458, "reward": 2.08537957072258, "reward_std": 0.0943592470139265, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9893973469734192, "step": 1538 }, { "clip_ratio": 0.0, "completion_length": 301.42857360839844, "epoch": 0.45971174669554177, "grad_norm": 0.7207725644111633, "kl": 0.220947265625, "learning_rate": 1.3098104676254397e-05, "loss": 0.1561, "reward": 2.087053656578064, "reward_std": 0.1700696423649788, "rewards/accuracy_reward": 0.12723214668221772, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9821428805589676, "step": 1539 }, { "clip_ratio": 0.0, "completion_length": 264.72546768188477, "epoch": 0.46001045478306324, "grad_norm": 0.33521294593811035, "kl": 0.3472900390625, "learning_rate": 1.3088185924227195e-05, "loss": 0.0779, "reward": 2.0145089626312256, "reward_std": 0.12269144505262375, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9899553805589676, "step": 1540 }, { "clip_ratio": 0.0, "completion_length": 289.3526840209961, "epoch": 0.4603091628705847, "grad_norm": 0.9353376030921936, "kl": 0.2880859375, "learning_rate": 1.3078263812556377e-05, "loss": 0.0412, "reward": 2.1132813692092896, "reward_std": 0.14613725244998932, "rewards/accuracy_reward": 0.13169643888249993, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.992745578289032, "step": 1541 }, { "clip_ratio": 0.0, "completion_length": 268.82814025878906, "epoch": 0.4606078709581062, "grad_norm": 0.29811567068099976, "kl": 0.241455078125, "learning_rate": 1.3068338352036236e-05, "loss": 0.083, "reward": 2.0725446939468384, "reward_std": 0.1022463571280241, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553805589676, "step": 1542 }, { "clip_ratio": 0.0, "completion_length": 276.2143020629883, "epoch": 0.46090657904562765, "grad_norm": 0.3659135699272156, "kl": 0.184814453125, "learning_rate": 1.3058409553464697e-05, "loss": 0.0998, "reward": 1.9787947535514832, "reward_std": 0.1419910565018654, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.9821429252624512, "rewards/tag_count_reward": 0.9877232611179352, "step": 1543 }, { "clip_ratio": 0.0, "completion_length": 280.8593864440918, "epoch": 0.4612052871331491, "grad_norm": 0.5203359127044678, "kl": 0.194091796875, "learning_rate": 1.3048477427643322e-05, "loss": 0.1062, "reward": 2.0574778020381927, "reward_std": 0.09659424889832735, "rewards/accuracy_reward": 0.07366072107106447, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 1544 }, { "clip_ratio": 0.0, "completion_length": 312.21876525878906, "epoch": 0.4615039952206706, "grad_norm": 0.21625825762748718, "kl": 0.1484375, "learning_rate": 1.3038541985377286e-05, "loss": 0.0691, "reward": 2.0904019474983215, "reward_std": 0.09513495862483978, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 1545 }, { "clip_ratio": 0.0, "completion_length": 342.0491256713867, "epoch": 0.46180270330819206, "grad_norm": 0.13520021736621857, "kl": 0.193115234375, "learning_rate": 1.302860323747538e-05, "loss": 0.0185, "reward": 1.9927456378936768, "reward_std": 0.06473214644938707, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949777126312256, "step": 1546 }, { "clip_ratio": 0.0, "completion_length": 367.77903747558594, "epoch": 0.46210141139571353, "grad_norm": 0.1924247443675995, "kl": 0.237060546875, "learning_rate": 1.3018661194749986e-05, "loss": 0.0292, "reward": 2.028459906578064, "reward_std": 0.09771534148603678, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.992745578289032, "step": 1547 }, { "clip_ratio": 0.0, "completion_length": 412.45760345458984, "epoch": 0.462400119483235, "grad_norm": 0.6723698377609253, "kl": 0.309814453125, "learning_rate": 1.3008715868017075e-05, "loss": 0.074, "reward": 2.0613840222358704, "reward_std": 0.14378640428185463, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.9799107313156128, "rewards/tag_count_reward": 0.9877232313156128, "step": 1548 }, { "clip_ratio": 0.0, "completion_length": 434.99332427978516, "epoch": 0.4626988275707565, "grad_norm": 0.6468639373779297, "kl": 0.427978515625, "learning_rate": 1.2998767268096183e-05, "loss": 0.1261, "reward": 1.9905134439468384, "reward_std": 0.17411011084914207, "rewards/accuracy_reward": 0.03571428847499192, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9815848618745804, "step": 1549 }, { "clip_ratio": 0.0, "completion_length": 515.8884048461914, "epoch": 0.46299753565827795, "grad_norm": 8.330711364746094, "kl": 0.30078125, "learning_rate": 1.2988815405810415e-05, "loss": 0.2467, "reward": 1.8733259439468384, "reward_std": 0.46806716173887253, "rewards/accuracy_reward": 0.06473214738070965, "rewards/format_reward": 0.8839286267757416, "rewards/tag_count_reward": 0.9246652126312256, "step": 1550 }, { "clip_ratio": 0.0, "completion_length": 512.4285888671875, "epoch": 0.4632962437457994, "grad_norm": 2.2328457832336426, "kl": 0.77490234375, "learning_rate": 1.2978860291986422e-05, "loss": 0.0693, "reward": 2.0055804550647736, "reward_std": 0.26598602905869484, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.9486607611179352, "rewards/tag_count_reward": 0.9698661118745804, "step": 1551 }, { "clip_ratio": 0.0, "completion_length": 485.6585159301758, "epoch": 0.4635949518333209, "grad_norm": 14.320402145385742, "kl": 1.48388671875, "learning_rate": 1.296890193745439e-05, "loss": 0.1115, "reward": 1.9715402722358704, "reward_std": 0.26071135699748993, "rewards/accuracy_reward": 0.058035716181620955, "rewards/format_reward": 0.9441964775323868, "rewards/tag_count_reward": 0.969308078289032, "step": 1552 }, { "clip_ratio": 0.0, "completion_length": 533.9777145385742, "epoch": 0.46389365992084236, "grad_norm": 1.3794375658035278, "kl": 0.29296875, "learning_rate": 1.295894035304803e-05, "loss": 0.0489, "reward": 2.018415242433548, "reward_std": 0.2623010128736496, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9760045111179352, "step": 1553 }, { "clip_ratio": 0.0, "completion_length": 533.3102874755859, "epoch": 0.46419236800836383, "grad_norm": 0.8791088461875916, "kl": 0.46875, "learning_rate": 1.294897554960458e-05, "loss": 0.0177, "reward": 1.9536831080913544, "reward_std": 0.3384134918451309, "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.93526791036129, "rewards/tag_count_reward": 0.9581473618745804, "step": 1554 }, { "clip_ratio": 0.0, "completion_length": 479.7901916503906, "epoch": 0.4644910760958853, "grad_norm": 3.639195203781128, "kl": 1.38671875, "learning_rate": 1.2939007537964758e-05, "loss": 0.1252, "reward": 1.7477679550647736, "reward_std": 0.6176354885101318, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.8035714626312256, "rewards/tag_count_reward": 0.8772321939468384, "step": 1555 }, { "clip_ratio": 0.0, "completion_length": 484.9866256713867, "epoch": 0.4647897841834068, "grad_norm": 7.506243705749512, "kl": 1.669921875, "learning_rate": 1.292903632897279e-05, "loss": 0.1153, "reward": 1.6601563096046448, "reward_std": 0.6333552002906799, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.7566964626312256, "rewards/tag_count_reward": 0.8476562947034836, "step": 1556 }, { "clip_ratio": 0.0, "completion_length": 479.95314025878906, "epoch": 0.46508849227092824, "grad_norm": 11.390954971313477, "kl": 1.63671875, "learning_rate": 1.2919061933476371e-05, "loss": 0.1319, "reward": 1.6428572237491608, "reward_std": 0.6992695480585098, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.7500000298023224, "rewards/tag_count_reward": 0.8370536118745804, "step": 1557 }, { "clip_ratio": 0.0, "completion_length": 520.2745742797852, "epoch": 0.4653872003584497, "grad_norm": 4.881496906280518, "kl": 1.626953125, "learning_rate": 1.2909084362326669e-05, "loss": 0.2455, "reward": 1.4877232611179352, "reward_std": 0.7456652075052261, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.6316964477300644, "rewards/tag_count_reward": 0.7756696790456772, "step": 1558 }, { "clip_ratio": 0.0, "completion_length": 528.3013534545898, "epoch": 0.4656859084459712, "grad_norm": 14.114680290222168, "kl": 2.1640625, "learning_rate": 1.28991036263783e-05, "loss": 0.2663, "reward": 1.6350447237491608, "reward_std": 0.6570875644683838, "rewards/accuracy_reward": 0.02901785890571773, "rewards/format_reward": 0.7633928954601288, "rewards/tag_count_reward": 0.8426339626312256, "step": 1559 }, { "clip_ratio": 0.0, "completion_length": 556.2500152587891, "epoch": 0.46598461653349266, "grad_norm": 3.904243230819702, "kl": 0.60595703125, "learning_rate": 1.288911973648933e-05, "loss": 0.2047, "reward": 1.7399554252624512, "reward_std": 0.6370501220226288, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.79464291036129, "rewards/tag_count_reward": 0.871651828289032, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 543.3169860839844, "epoch": 0.46628332462101413, "grad_norm": 6.391267776489258, "kl": 1.34765625, "learning_rate": 1.2879132703521249e-05, "loss": 0.2811, "reward": 1.5418527722358704, "reward_std": 0.768242284655571, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.6875000298023224, "rewards/tag_count_reward": 0.787388414144516, "step": 1561 }, { "clip_ratio": 0.0, "completion_length": 511.8370666503906, "epoch": 0.4665820327085356, "grad_norm": 3.731956958770752, "kl": 0.689453125, "learning_rate": 1.2869142538338974e-05, "loss": 0.2112, "reward": 1.5647322237491608, "reward_std": 0.7533873021602631, "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.7366071790456772, "rewards/tag_count_reward": 0.8058036118745804, "step": 1562 }, { "clip_ratio": 0.0, "completion_length": 475.4018096923828, "epoch": 0.46688074079605707, "grad_norm": 4.596372127532959, "kl": 0.72265625, "learning_rate": 1.2859149251810823e-05, "loss": 0.1921, "reward": 1.6635045409202576, "reward_std": 0.6659720316529274, "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.761160746216774, "rewards/tag_count_reward": 0.8197545111179352, "step": 1563 }, { "clip_ratio": 0.0, "completion_length": 437.88842010498047, "epoch": 0.46717944888357854, "grad_norm": 7.854588985443115, "kl": 0.9521484375, "learning_rate": 1.284915285480851e-05, "loss": 0.2716, "reward": 1.6875000596046448, "reward_std": 0.6345488429069519, "rewards/accuracy_reward": 0.042410715483129025, "rewards/format_reward": 0.7946428805589676, "rewards/tag_count_reward": 0.8504464775323868, "step": 1564 }, { "clip_ratio": 0.0, "completion_length": 429.2745666503906, "epoch": 0.4674781569711, "grad_norm": 15.444197654724121, "kl": 3.3203125, "learning_rate": 1.2839153358207142e-05, "loss": 0.6022, "reward": 1.6964286267757416, "reward_std": 0.6736363917589188, "rewards/accuracy_reward": 0.07812500116415322, "rewards/format_reward": 0.7812500298023224, "rewards/tag_count_reward": 0.8370536118745804, "step": 1565 }, { "clip_ratio": 0.0, "completion_length": 457.2165298461914, "epoch": 0.4677768650586215, "grad_norm": 22.88117027282715, "kl": 4.765625, "learning_rate": 1.2829150772885186e-05, "loss": 0.6555, "reward": 1.7427456080913544, "reward_std": 0.6751928329467773, "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.8035714626312256, "rewards/tag_count_reward": 0.8565848618745804, "step": 1566 }, { "clip_ratio": 0.0, "completion_length": 369.63170623779297, "epoch": 0.46807557314614295, "grad_norm": 11.30234146118164, "kl": 3.04296875, "learning_rate": 1.2819145109724476e-05, "loss": 0.4465, "reward": 1.7868304252624512, "reward_std": 0.56953314691782, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.8415178805589676, "rewards/tag_count_reward": 0.8805803954601288, "step": 1567 }, { "clip_ratio": 0.0, "completion_length": 368.6004638671875, "epoch": 0.4683742812336644, "grad_norm": 4.528553485870361, "kl": 2.337890625, "learning_rate": 1.280913637961019e-05, "loss": 0.1835, "reward": 1.9347099363803864, "reward_std": 0.6196972131729126, "rewards/accuracy_reward": 0.2008928619325161, "rewards/format_reward": 0.8459821790456772, "rewards/tag_count_reward": 0.8878348767757416, "step": 1568 }, { "clip_ratio": 0.0, "completion_length": 391.07591247558594, "epoch": 0.4686729893211859, "grad_norm": 3.475992441177368, "kl": 1.091796875, "learning_rate": 1.2799124593430849e-05, "loss": 0.015, "reward": 1.85100457072258, "reward_std": 0.5205830484628677, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.886160746216774, "rewards/tag_count_reward": 0.91573666036129, "step": 1569 }, { "clip_ratio": 0.0, "completion_length": 391.2745666503906, "epoch": 0.46897169740870737, "grad_norm": 4.643089771270752, "kl": 1.4453125, "learning_rate": 1.2789109762078296e-05, "loss": 0.0704, "reward": 1.8281250894069672, "reward_std": 0.5666132271289825, "rewards/accuracy_reward": 0.07589286169968545, "rewards/format_reward": 0.85714291036129, "rewards/tag_count_reward": 0.895089328289032, "step": 1570 }, { "clip_ratio": 0.0, "completion_length": 400.8861770629883, "epoch": 0.46927040549622884, "grad_norm": 2.2416491508483887, "kl": 1.3193359375, "learning_rate": 1.2779091896447682e-05, "loss": 0.1995, "reward": 1.8772322237491608, "reward_std": 0.5095186829566956, "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.879464328289032, "rewards/tag_count_reward": 0.9129464775323868, "step": 1571 }, { "clip_ratio": 0.0, "completion_length": 402.29466247558594, "epoch": 0.46956911358375025, "grad_norm": 3.450453281402588, "kl": 1.662109375, "learning_rate": 1.2769071007437466e-05, "loss": 0.1762, "reward": 1.868303656578064, "reward_std": 0.49448274821043015, "rewards/accuracy_reward": 0.06250000209547579, "rewards/format_reward": 0.8839286118745804, "rewards/tag_count_reward": 0.9218750447034836, "step": 1572 }, { "clip_ratio": 0.0, "completion_length": 358.39733123779297, "epoch": 0.4698678216712717, "grad_norm": 9.747532844543457, "kl": 2.021484375, "learning_rate": 1.2759047105949391e-05, "loss": 0.1739, "reward": 1.9112724363803864, "reward_std": 0.41201820224523544, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.9040178954601288, "rewards/tag_count_reward": 0.9335937947034836, "step": 1573 }, { "clip_ratio": 0.0, "completion_length": 331.6227798461914, "epoch": 0.4701665297587932, "grad_norm": 0.9530859589576721, "kl": 0.904296875, "learning_rate": 1.2749020202888485e-05, "loss": 0.1379, "reward": 1.9397321939468384, "reward_std": 0.5024787411093712, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.8816964775323868, "rewards/tag_count_reward": 0.9174107611179352, "step": 1574 }, { "clip_ratio": 0.0, "completion_length": 332.2455520629883, "epoch": 0.47046523784631467, "grad_norm": 1.0976117849349976, "kl": 0.88330078125, "learning_rate": 1.2738990309163025e-05, "loss": 0.0549, "reward": 1.9313617050647736, "reward_std": 0.35357416421175003, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.917410746216774, "rewards/tag_count_reward": 0.9469866454601288, "step": 1575 }, { "clip_ratio": 0.0, "completion_length": 326.4419708251953, "epoch": 0.47076394593383614, "grad_norm": 0.6030490398406982, "kl": 0.46630859375, "learning_rate": 1.2728957435684561e-05, "loss": 0.0653, "reward": 1.957589328289032, "reward_std": 0.2936122417449951, "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.9441964626312256, "rewards/tag_count_reward": 0.964285746216774, "step": 1576 }, { "clip_ratio": 0.0, "completion_length": 269.1116256713867, "epoch": 0.4710626540213576, "grad_norm": 1.121793270111084, "kl": 0.68017578125, "learning_rate": 1.2718921593367874e-05, "loss": 0.1457, "reward": 2.03459832072258, "reward_std": 0.3759559616446495, "rewards/accuracy_reward": 0.14062500861473382, "rewards/format_reward": 0.9397321790456772, "rewards/tag_count_reward": 0.9542411118745804, "step": 1577 }, { "clip_ratio": 0.0, "completion_length": 226.2053680419922, "epoch": 0.4713613621088791, "grad_norm": 0.5903354287147522, "kl": 0.37451171875, "learning_rate": 1.2708882793130974e-05, "loss": 0.1749, "reward": 1.9748885035514832, "reward_std": 0.23413890600204468, "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9704241752624512, "step": 1578 }, { "clip_ratio": 0.0, "completion_length": 237.37054443359375, "epoch": 0.47166007019640055, "grad_norm": 0.344332754611969, "kl": 0.2001953125, "learning_rate": 1.2698841045895096e-05, "loss": 0.1955, "reward": 2.0535715222358704, "reward_std": 0.15277874656021595, "rewards/accuracy_reward": 0.08482143515720963, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9843750596046448, "step": 1579 }, { "clip_ratio": 0.0, "completion_length": 206.60045623779297, "epoch": 0.471958778283922, "grad_norm": 0.2929181456565857, "kl": 0.308837890625, "learning_rate": 1.2688796362584676e-05, "loss": 0.1089, "reward": 2.0608260333538055, "reward_std": 0.1699548065662384, "rewards/accuracy_reward": 0.09598214598372579, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.984933078289032, "step": 1580 }, { "clip_ratio": 0.0, "completion_length": 198.83259963989258, "epoch": 0.4722574863714435, "grad_norm": 0.3481777310371399, "kl": 0.201904296875, "learning_rate": 1.2678748754127344e-05, "loss": 0.1397, "reward": 2.0435268878936768, "reward_std": 0.1310805380344391, "rewards/accuracy_reward": 0.06473214738070965, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9899553805589676, "step": 1581 }, { "clip_ratio": 0.0, "completion_length": 179.23215103149414, "epoch": 0.47255619445896496, "grad_norm": 0.8405432105064392, "kl": 0.203857421875, "learning_rate": 1.2668698231453908e-05, "loss": 0.2202, "reward": 2.093750089406967, "reward_std": 0.1191515363752842, "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.988839328289032, "step": 1582 }, { "clip_ratio": 0.0, "completion_length": 195.09375762939453, "epoch": 0.47285490254648643, "grad_norm": 0.6037490367889404, "kl": 0.282470703125, "learning_rate": 1.2658644805498361e-05, "loss": 0.2254, "reward": 2.0585938096046448, "reward_std": 0.1408484112471342, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.984933078289032, "step": 1583 }, { "clip_ratio": 0.0, "completion_length": 190.03572463989258, "epoch": 0.4731536106340079, "grad_norm": 0.5298799276351929, "kl": 0.256103515625, "learning_rate": 1.2648588487197842e-05, "loss": 0.2471, "reward": 2.06584832072258, "reward_std": 0.1530716698616743, "rewards/accuracy_reward": 0.09375000861473382, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.987723246216774, "step": 1584 }, { "clip_ratio": 0.0, "completion_length": 196.7879524230957, "epoch": 0.4734523187215294, "grad_norm": 0.39179274439811707, "kl": 0.233642578125, "learning_rate": 1.2638529287492635e-05, "loss": 0.1059, "reward": 2.1250001192092896, "reward_std": 0.11072594858705997, "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1585 }, { "clip_ratio": 0.0, "completion_length": 192.55804824829102, "epoch": 0.47375102680905085, "grad_norm": 3.5653951168060303, "kl": 0.698486328125, "learning_rate": 1.262846721732617e-05, "loss": 0.1236, "reward": 2.114397406578064, "reward_std": 0.1567186564207077, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.991629496216774, "step": 1586 }, { "clip_ratio": 0.0, "completion_length": 183.56920623779297, "epoch": 0.4740497348965723, "grad_norm": 4.3979411125183105, "kl": 0.220703125, "learning_rate": 1.2618402287644989e-05, "loss": 0.1054, "reward": 2.087611734867096, "reward_std": 0.11628833692520857, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9960937649011612, "step": 1587 }, { "clip_ratio": 0.0, "completion_length": 197.32144165039062, "epoch": 0.4743484429840938, "grad_norm": 0.4362083077430725, "kl": 0.181396484375, "learning_rate": 1.2608334509398752e-05, "loss": 0.063, "reward": 2.107142925262451, "reward_std": 0.13606741651892662, "rewards/accuracy_reward": 0.11830358020961285, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 1588 }, { "clip_ratio": 0.0, "completion_length": 201.23215103149414, "epoch": 0.47464715107161526, "grad_norm": 0.7354264259338379, "kl": 0.255859375, "learning_rate": 1.2598263893540207e-05, "loss": 0.1584, "reward": 2.096540242433548, "reward_std": 0.17163363471627235, "rewards/accuracy_reward": 0.12276786169968545, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9893973469734192, "step": 1589 }, { "clip_ratio": 0.0, "completion_length": 187.05804443359375, "epoch": 0.47494585915913673, "grad_norm": 0.6461039185523987, "kl": 1.38720703125, "learning_rate": 1.2588190451025209e-05, "loss": 0.0802, "reward": 2.03850457072258, "reward_std": 0.12603804282844067, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.991629496216774, "step": 1590 }, { "clip_ratio": 0.0, "completion_length": 200.21206283569336, "epoch": 0.4752445672466582, "grad_norm": 1.1521344184875488, "kl": 0.749755859375, "learning_rate": 1.2578114192812669e-05, "loss": 0.0227, "reward": 2.102120578289032, "reward_std": 0.1158034186810255, "rewards/accuracy_reward": 0.11160715040750802, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9972098469734192, "step": 1591 }, { "clip_ratio": 0.0, "completion_length": 223.47768783569336, "epoch": 0.47554327533417967, "grad_norm": 2.786315679550171, "kl": 0.399658203125, "learning_rate": 1.2568035129864569e-05, "loss": 0.1214, "reward": 2.1171876192092896, "reward_std": 0.12200526148080826, "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9921875298023224, "step": 1592 }, { "clip_ratio": 0.0, "completion_length": 236.00671005249023, "epoch": 0.47584198342170114, "grad_norm": 0.5216723084449768, "kl": 0.25439453125, "learning_rate": 1.255795327314594e-05, "loss": 0.163, "reward": 2.0613840222358704, "reward_std": 0.15262571349740028, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875447034836, "step": 1593 }, { "clip_ratio": 0.0, "completion_length": 262.52456283569336, "epoch": 0.4761406915092226, "grad_norm": 2.8749234676361084, "kl": 0.198486328125, "learning_rate": 1.2547868633624858e-05, "loss": 0.0935, "reward": 2.102678656578064, "reward_std": 0.16534782573580742, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714477300644, "step": 1594 }, { "clip_ratio": 0.0, "completion_length": 233.15402603149414, "epoch": 0.4764393995967441, "grad_norm": 0.6039649844169617, "kl": 0.268798828125, "learning_rate": 1.2537781222272423e-05, "loss": 0.0285, "reward": 2.065290242433548, "reward_std": 0.14287807792425156, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9938616454601288, "step": 1595 }, { "clip_ratio": 0.0, "completion_length": 278.2857246398926, "epoch": 0.47673810768426556, "grad_norm": 0.45594266057014465, "kl": 0.71484375, "learning_rate": 1.2527691050062743e-05, "loss": -0.0076, "reward": 2.140625089406967, "reward_std": 0.16188351064920425, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9888393133878708, "step": 1596 }, { "clip_ratio": 0.0, "completion_length": 290.42189025878906, "epoch": 0.477036815771787, "grad_norm": 0.17301945388317108, "kl": 0.25732421875, "learning_rate": 1.2517598127972943e-05, "loss": 0.0511, "reward": 2.078125089406967, "reward_std": 0.12460112012922764, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9866071790456772, "step": 1597 }, { "clip_ratio": 0.0, "completion_length": 262.2388496398926, "epoch": 0.4773355238593085, "grad_norm": 1.9822685718536377, "kl": 0.36083984375, "learning_rate": 1.250750246698313e-05, "loss": 0.0583, "reward": 2.05412957072258, "reward_std": 0.1769004613161087, "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9871652275323868, "step": 1598 }, { "clip_ratio": 0.0, "completion_length": 277.69866943359375, "epoch": 0.47763423194682997, "grad_norm": 0.266641229391098, "kl": 0.186767578125, "learning_rate": 1.2497404078076396e-05, "loss": 0.0056, "reward": 2.0507813692092896, "reward_std": 0.14404060877859592, "rewards/accuracy_reward": 0.06250000139698386, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 1599 }, { "clip_ratio": 0.0, "completion_length": 323.0067138671875, "epoch": 0.47793294003435144, "grad_norm": 2.1489527225494385, "kl": 0.194091796875, "learning_rate": 1.2487302972238795e-05, "loss": 0.0144, "reward": 2.077009081840515, "reward_std": 0.12874540034681559, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 1600 }, { "clip_ratio": 0.0, "completion_length": 344.9330520629883, "epoch": 0.4782316481218729, "grad_norm": 0.1404573619365692, "kl": 0.3359375, "learning_rate": 1.2477199160459345e-05, "loss": 0.0152, "reward": 2.0747768878936768, "reward_std": 0.12614197377115488, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9921875149011612, "step": 1601 }, { "clip_ratio": 0.0, "completion_length": 388.67859649658203, "epoch": 0.4785303562093944, "grad_norm": 0.22621449828147888, "kl": 0.205810546875, "learning_rate": 1.246709265373e-05, "loss": 0.0667, "reward": 2.0943081378936768, "reward_std": 0.14123749919235706, "rewards/accuracy_reward": 0.11383929336443543, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9916294813156128, "step": 1602 }, { "clip_ratio": 0.0, "completion_length": 407.2299270629883, "epoch": 0.47882906429691585, "grad_norm": 0.18067975342273712, "kl": 0.1865234375, "learning_rate": 1.2456983463045644e-05, "loss": 0.0828, "reward": 2.040736675262451, "reward_std": 0.13447590917348862, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.991629496216774, "step": 1603 }, { "clip_ratio": 0.0, "completion_length": 460.4397506713867, "epoch": 0.4791277723844373, "grad_norm": 0.14966416358947754, "kl": 0.228515625, "learning_rate": 1.2446871599404095e-05, "loss": 0.0468, "reward": 2.0027902722358704, "reward_std": 0.13338622823357582, "rewards/accuracy_reward": 0.03125000232830644, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652275323868, "step": 1604 }, { "clip_ratio": 0.0, "completion_length": 473.85938262939453, "epoch": 0.4794264804719588, "grad_norm": 0.13514436781406403, "kl": 0.596923828125, "learning_rate": 1.2436757073806065e-05, "loss": 0.0327, "reward": 2.092634081840515, "reward_std": 0.1116138193756342, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.994419664144516, "step": 1605 }, { "clip_ratio": 0.0, "completion_length": 495.4643096923828, "epoch": 0.47972518855948026, "grad_norm": 1.7004420757293701, "kl": 0.2294921875, "learning_rate": 1.2426639897255166e-05, "loss": 0.0431, "reward": 2.0965402722358704, "reward_std": 0.18153483606874943, "rewards/accuracy_reward": 0.13169643888249993, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9871652275323868, "step": 1606 }, { "clip_ratio": 0.0, "completion_length": 502.0156555175781, "epoch": 0.48002389664700174, "grad_norm": 2.1899640560150146, "kl": 0.41015625, "learning_rate": 1.2416520080757892e-05, "loss": 0.1012, "reward": 2.03459832072258, "reward_std": 0.2818370293825865, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9765625596046448, "step": 1607 }, { "clip_ratio": 0.0, "completion_length": 493.5312728881836, "epoch": 0.4803226047345232, "grad_norm": 0.991590678691864, "kl": 0.286376953125, "learning_rate": 1.2406397635323617e-05, "loss": 0.0738, "reward": 2.053571581840515, "reward_std": 0.21674077678471804, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9866071790456772, "step": 1608 }, { "clip_ratio": 0.0, "completion_length": 486.21429443359375, "epoch": 0.4806213128220447, "grad_norm": 0.13757111132144928, "kl": 0.321533203125, "learning_rate": 1.239627257196457e-05, "loss": 0.0196, "reward": 2.040178656578064, "reward_std": 0.12302087247371674, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.995535746216774, "step": 1609 }, { "clip_ratio": 0.0, "completion_length": 495.22994232177734, "epoch": 0.48092002090956615, "grad_norm": 0.1584738790988922, "kl": 0.156982421875, "learning_rate": 1.2386144901695817e-05, "loss": 0.0189, "reward": 2.044642984867096, "reward_std": 0.13396254926919937, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9955357313156128, "step": 1610 }, { "clip_ratio": 0.0, "completion_length": 543.1875228881836, "epoch": 0.4812187289970876, "grad_norm": 0.22797833383083344, "kl": 0.219482421875, "learning_rate": 1.2376014635535285e-05, "loss": 0.0406, "reward": 2.1177456378936768, "reward_std": 0.2742708548903465, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9838170111179352, "step": 1611 }, { "clip_ratio": 0.0, "completion_length": 570.8727874755859, "epoch": 0.4815174370846091, "grad_norm": 0.1813584417104721, "kl": 0.35107421875, "learning_rate": 1.2365881784503704e-05, "loss": 0.0504, "reward": 2.0468751192092896, "reward_std": 0.21795134618878365, "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9843750298023224, "step": 1612 }, { "clip_ratio": 0.0, "completion_length": 577.2187652587891, "epoch": 0.48181614517213056, "grad_norm": 0.22895725071430206, "kl": 0.32080078125, "learning_rate": 1.2355746359624621e-05, "loss": 0.0172, "reward": 2.0351563096046448, "reward_std": 0.2410703431814909, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.983816996216774, "step": 1613 }, { "clip_ratio": 0.0, "completion_length": 565.7946624755859, "epoch": 0.48211485325965203, "grad_norm": 0.1457672417163849, "kl": 0.3876953125, "learning_rate": 1.2345608371924384e-05, "loss": 0.0274, "reward": 2.0602679550647736, "reward_std": 0.17388197407126427, "rewards/accuracy_reward": 0.09375000325962901, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9866071790456772, "step": 1614 }, { "clip_ratio": 0.0, "completion_length": 550.5602874755859, "epoch": 0.48241356134717345, "grad_norm": 0.234925776720047, "kl": 0.228515625, "learning_rate": 1.2335467832432136e-05, "loss": 0.0126, "reward": 2.0362724363803864, "reward_std": 0.2107427641749382, "rewards/accuracy_reward": 0.07366071874275804, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9871652275323868, "step": 1615 }, { "clip_ratio": 0.0, "completion_length": 576.5022430419922, "epoch": 0.4827122694346949, "grad_norm": 1.082073450088501, "kl": 0.23095703125, "learning_rate": 1.2325324752179788e-05, "loss": 0.0577, "reward": 2.032924175262451, "reward_std": 0.2304920069873333, "rewards/accuracy_reward": 0.06696429010480642, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9882812947034836, "step": 1616 }, { "clip_ratio": 0.0, "completion_length": 547.8259124755859, "epoch": 0.4830109775222164, "grad_norm": 0.25518539547920227, "kl": 0.2196044921875, "learning_rate": 1.2315179142202012e-05, "loss": 0.0366, "reward": 2.1383930444717407, "reward_std": 0.13890758249908686, "rewards/accuracy_reward": 0.15625000488944352, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9910714477300644, "step": 1617 }, { "clip_ratio": 0.0, "completion_length": 535.9710083007812, "epoch": 0.48330968560973786, "grad_norm": 0.20485395193099976, "kl": 0.68310546875, "learning_rate": 1.2305031013536244e-05, "loss": 0.035, "reward": 2.0786831378936768, "reward_std": 0.19841178879141808, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.984933078289032, "step": 1618 }, { "clip_ratio": 0.0, "completion_length": 555.2411041259766, "epoch": 0.48360839369725933, "grad_norm": 0.2708071172237396, "kl": 0.593505859375, "learning_rate": 1.2294880377222649e-05, "loss": 0.0444, "reward": 2.1116072237491608, "reward_std": 0.2328457273542881, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9821428954601288, "step": 1619 }, { "clip_ratio": 0.0, "completion_length": 550.4977951049805, "epoch": 0.4839071017847808, "grad_norm": 0.1636449247598648, "kl": 0.17333984375, "learning_rate": 1.2284727244304126e-05, "loss": 0.0278, "reward": 2.075334906578064, "reward_std": 0.18218902498483658, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9927455633878708, "step": 1620 }, { "clip_ratio": 0.0, "completion_length": 565.5268096923828, "epoch": 0.4842058098723023, "grad_norm": 0.39162540435791016, "kl": 0.443359375, "learning_rate": 1.227457162582629e-05, "loss": 0.0497, "reward": 2.0195313096046448, "reward_std": 0.22822269424796104, "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.983816996216774, "step": 1621 }, { "clip_ratio": 0.0, "completion_length": 568.4888763427734, "epoch": 0.48450451795982374, "grad_norm": 0.16161483526229858, "kl": 0.13037109375, "learning_rate": 1.2264413532837456e-05, "loss": 0.021, "reward": 2.0892857909202576, "reward_std": 0.1540526356548071, "rewards/accuracy_reward": 0.10267857508733869, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.995535746216774, "step": 1622 }, { "clip_ratio": 0.0, "completion_length": 482.07592010498047, "epoch": 0.4848032260473452, "grad_norm": 0.1982269436120987, "kl": 0.5626220703125, "learning_rate": 1.2254252976388637e-05, "loss": 0.0151, "reward": 2.1484375596046448, "reward_std": 0.1560599310323596, "rewards/accuracy_reward": 0.1718750037252903, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9899553805589676, "step": 1623 }, { "clip_ratio": 0.0, "completion_length": 525.850471496582, "epoch": 0.4851019341348667, "grad_norm": 0.15945205092430115, "kl": 0.6470947265625, "learning_rate": 1.2244089967533515e-05, "loss": -0.0028, "reward": 2.0982143878936768, "reward_std": 0.19773593731224537, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9910714775323868, "step": 1624 }, { "clip_ratio": 0.0, "completion_length": 569.2879638671875, "epoch": 0.48540064222238816, "grad_norm": 1.7902545928955078, "kl": 0.61962890625, "learning_rate": 1.2233924517328456e-05, "loss": 0.0119, "reward": 2.023437589406967, "reward_std": 0.1821996457874775, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9899553954601288, "step": 1625 }, { "clip_ratio": 0.0, "completion_length": 549.2678833007812, "epoch": 0.4856993503099096, "grad_norm": 0.19948209822177887, "kl": 0.6348876953125, "learning_rate": 1.2223756636832471e-05, "loss": -0.009, "reward": 2.1356027126312256, "reward_std": 0.16709168069064617, "rewards/accuracy_reward": 0.15625000977888703, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.992745578289032, "step": 1626 }, { "clip_ratio": 0.0, "completion_length": 541.1049346923828, "epoch": 0.4859980583974311, "grad_norm": 0.4988417327404022, "kl": 0.5880126953125, "learning_rate": 1.2213586337107217e-05, "loss": 0.0313, "reward": 2.1551340222358704, "reward_std": 0.22790442034602165, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 1627 }, { "clip_ratio": 0.0, "completion_length": 580.5558319091797, "epoch": 0.48629676648495257, "grad_norm": 0.16327713429927826, "kl": 0.29296875, "learning_rate": 1.220341362921698e-05, "loss": 0.0142, "reward": 2.0396206080913544, "reward_std": 0.09953026473522186, "rewards/accuracy_reward": 0.05357142956927419, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 1628 }, { "clip_ratio": 0.0, "completion_length": 543.6071624755859, "epoch": 0.48659547457247404, "grad_norm": 0.7487332224845886, "kl": 1.01220703125, "learning_rate": 1.2193238524228677e-05, "loss": 0.0164, "reward": 2.1004465222358704, "reward_std": 0.21650858223438263, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9866071790456772, "step": 1629 }, { "clip_ratio": 0.0, "completion_length": 580.7835083007812, "epoch": 0.4868941826599955, "grad_norm": 4.633334159851074, "kl": 1.457763671875, "learning_rate": 1.2183061033211817e-05, "loss": 0.0833, "reward": 2.0161831378936768, "reward_std": 0.25549642369151115, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.982700914144516, "step": 1630 }, { "clip_ratio": 0.0, "completion_length": 573.5647583007812, "epoch": 0.487192890747517, "grad_norm": 0.3372010886669159, "kl": 0.2440185546875, "learning_rate": 1.2172881167238515e-05, "loss": 0.042, "reward": 2.0770090222358704, "reward_std": 0.19634636864066124, "rewards/accuracy_reward": 0.1138392947614193, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.987723246216774, "step": 1631 }, { "clip_ratio": 0.0, "completion_length": 582.3058319091797, "epoch": 0.48749159883503845, "grad_norm": 0.1206597164273262, "kl": 0.113525390625, "learning_rate": 1.216269893738347e-05, "loss": 0.0111, "reward": 2.0892858505249023, "reward_std": 0.09299687948077917, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9977678954601288, "step": 1632 }, { "clip_ratio": 0.0, "completion_length": 590.7902069091797, "epoch": 0.4877903069225599, "grad_norm": 0.25098758935928345, "kl": 0.2357177734375, "learning_rate": 1.2152514354723948e-05, "loss": 0.0163, "reward": 2.0379465520381927, "reward_std": 0.15173897612839937, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9910714477300644, "step": 1633 }, { "clip_ratio": 0.0, "completion_length": 591.2165298461914, "epoch": 0.4880890150100814, "grad_norm": 0.2778991460800171, "kl": 0.24267578125, "learning_rate": 1.2142327430339777e-05, "loss": 0.023, "reward": 2.0552456080913544, "reward_std": 0.204232107847929, "rewards/accuracy_reward": 0.09375000093132257, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9882812798023224, "step": 1634 }, { "clip_ratio": 0.0, "completion_length": 577.9553680419922, "epoch": 0.48838772309760287, "grad_norm": 0.48614197969436646, "kl": 0.3546142578125, "learning_rate": 1.213213817531333e-05, "loss": 0.0239, "reward": 2.0011161863803864, "reward_std": 0.1571940053254366, "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553954601288, "step": 1635 }, { "clip_ratio": 0.0, "completion_length": 574.3995819091797, "epoch": 0.48868643118512434, "grad_norm": 0.13192857801914215, "kl": 0.1005859375, "learning_rate": 1.2121946600729524e-05, "loss": 0.006, "reward": 2.111049175262451, "reward_std": 0.1705128699541092, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.997209832072258, "step": 1636 }, { "clip_ratio": 0.0, "completion_length": 566.2946624755859, "epoch": 0.4889851392726458, "grad_norm": 0.26006510853767395, "kl": 0.227783203125, "learning_rate": 1.2111752717675788e-05, "loss": 0.062, "reward": 2.0373885333538055, "reward_std": 0.17982641607522964, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9905134439468384, "step": 1637 }, { "clip_ratio": 0.0, "completion_length": 570.185302734375, "epoch": 0.4892838473601673, "grad_norm": 1.5967906713485718, "kl": 0.156982421875, "learning_rate": 1.2101556537242069e-05, "loss": 0.0196, "reward": 2.0228796005249023, "reward_std": 0.08875143248587847, "rewards/accuracy_reward": 0.029017858440056443, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9983258992433548, "step": 1638 }, { "clip_ratio": 0.0, "completion_length": 554.7835083007812, "epoch": 0.48958255544768875, "grad_norm": 0.1646958738565445, "kl": 0.38916015625, "learning_rate": 1.2091358070520813e-05, "loss": 0.0126, "reward": 2.1143974661827087, "reward_std": 0.21580640599131584, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652126312256, "step": 1639 }, { "clip_ratio": 0.0, "completion_length": 560.6495742797852, "epoch": 0.4898812635352102, "grad_norm": 0.14629609882831573, "kl": 0.126708984375, "learning_rate": 1.2081157328606951e-05, "loss": 0.0338, "reward": 2.111607253551483, "reward_std": 0.18541727028787136, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9955357313156128, "step": 1640 }, { "clip_ratio": 0.0, "completion_length": 568.5558319091797, "epoch": 0.4901799716227317, "grad_norm": 0.1762150526046753, "kl": 0.127685546875, "learning_rate": 1.2070954322597893e-05, "loss": 0.0303, "reward": 2.1344866156578064, "reward_std": 0.15925164707005024, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616454601288, "step": 1641 }, { "clip_ratio": 0.0, "completion_length": 557.0424423217773, "epoch": 0.49047867971025316, "grad_norm": 0.21554982662200928, "kl": 0.1256103515625, "learning_rate": 1.2060749063593503e-05, "loss": 0.0166, "reward": 2.1250001192092896, "reward_std": 0.11175358109176159, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 1.0, "step": 1642 }, { "clip_ratio": 0.0, "completion_length": 575.7254791259766, "epoch": 0.49077738779777463, "grad_norm": 1.9879367351531982, "kl": 0.421630859375, "learning_rate": 1.205054156269611e-05, "loss": 0.0396, "reward": 2.0943081378936768, "reward_std": 0.0691964328289032, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 1643 }, { "clip_ratio": 0.0, "completion_length": 541.5937957763672, "epoch": 0.4910760958852961, "grad_norm": 0.2787121832370758, "kl": 0.343017578125, "learning_rate": 1.204033183101047e-05, "loss": 0.0295, "reward": 2.0145090222358704, "reward_std": 0.14865506812930107, "rewards/accuracy_reward": 0.03571428684517741, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9899553954601288, "step": 1644 }, { "clip_ratio": 0.0, "completion_length": 597.6786041259766, "epoch": 0.4913748039728176, "grad_norm": 0.41068896651268005, "kl": 0.3671875, "learning_rate": 1.203011987964377e-05, "loss": 0.0414, "reward": 2.024553656578064, "reward_std": 0.19603809341788292, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9843750447034836, "step": 1645 }, { "clip_ratio": 0.0, "completion_length": 539.7924270629883, "epoch": 0.49167351206033905, "grad_norm": 0.12358429282903671, "kl": 0.1470947265625, "learning_rate": 1.2019905719705618e-05, "loss": 0.0392, "reward": 2.0664063096046448, "reward_std": 0.09208215493708849, "rewards/accuracy_reward": 0.08258929220028222, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 1646 }, { "clip_ratio": 0.0, "completion_length": 530.5223541259766, "epoch": 0.4919722201478605, "grad_norm": 0.17302095890045166, "kl": 0.1771240234375, "learning_rate": 1.2009689362308014e-05, "loss": 0.0564, "reward": 2.165736734867096, "reward_std": 0.16199100762605667, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9871651977300644, "step": 1647 }, { "clip_ratio": 0.0, "completion_length": 549.1250305175781, "epoch": 0.492270928235382, "grad_norm": 0.13561087846755981, "kl": 0.1337890625, "learning_rate": 1.1999470818565355e-05, "loss": 0.0188, "reward": 2.079241156578064, "reward_std": 0.096259955316782, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.996651828289032, "step": 1648 }, { "clip_ratio": 0.0, "completion_length": 570.7991333007812, "epoch": 0.49256963632290346, "grad_norm": 0.18004277348518372, "kl": 0.1431884765625, "learning_rate": 1.1989250099594412e-05, "loss": 0.04, "reward": 2.1328126192092896, "reward_std": 0.18930701725184917, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875447034836, "step": 1649 }, { "clip_ratio": 0.0, "completion_length": 578.7768096923828, "epoch": 0.49286834441042493, "grad_norm": 0.14682213962078094, "kl": 0.1759033203125, "learning_rate": 1.1979027216514329e-05, "loss": 0.0124, "reward": 2.0312501192092896, "reward_std": 0.10682361386716366, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1650 }, { "clip_ratio": 0.0, "completion_length": 580.2053833007812, "epoch": 0.4931670524979464, "grad_norm": 0.17153599858283997, "kl": 0.183837890625, "learning_rate": 1.1968802180446602e-05, "loss": 0.049, "reward": 2.099888503551483, "reward_std": 0.21792021580040455, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.983816996216774, "step": 1651 }, { "clip_ratio": 0.0, "completion_length": 558.7678833007812, "epoch": 0.49346576058546787, "grad_norm": 0.10219306498765945, "kl": 0.1024169921875, "learning_rate": 1.1958575002515062e-05, "loss": 0.0147, "reward": 2.1065849661827087, "reward_std": 0.09679610282182693, "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 1652 }, { "clip_ratio": 0.0, "completion_length": 591.9643249511719, "epoch": 0.49376446867298934, "grad_norm": 0.22287631034851074, "kl": 0.2373046875, "learning_rate": 1.1948345693845884e-05, "loss": 0.0456, "reward": 2.002790242433548, "reward_std": 0.1977940909564495, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9849330633878708, "step": 1653 }, { "clip_ratio": 0.0, "completion_length": 576.0067291259766, "epoch": 0.4940631767605108, "grad_norm": 0.12506070733070374, "kl": 0.123291015625, "learning_rate": 1.1938114265567552e-05, "loss": 0.0353, "reward": 2.0530134439468384, "reward_std": 0.10918493382632732, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455633878708, "step": 1654 }, { "clip_ratio": 0.0, "completion_length": 533.9531478881836, "epoch": 0.4943618848480323, "grad_norm": 0.15427805483341217, "kl": 0.165283203125, "learning_rate": 1.192788072881085e-05, "loss": 0.0425, "reward": 2.0334822237491608, "reward_std": 0.1805781126022339, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9866071939468384, "step": 1655 }, { "clip_ratio": 0.0, "completion_length": 574.9352798461914, "epoch": 0.49466059293555376, "grad_norm": 0.18929754197597504, "kl": 0.1461181640625, "learning_rate": 1.1917645094708867e-05, "loss": 0.0596, "reward": 2.037946581840515, "reward_std": 0.24318337440490723, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9866071790456772, "step": 1656 }, { "clip_ratio": 0.0, "completion_length": 590.6741485595703, "epoch": 0.4949593010230752, "grad_norm": 0.2220846563577652, "kl": 0.145751953125, "learning_rate": 1.1907407374396973e-05, "loss": 0.0274, "reward": 2.0541295409202576, "reward_std": 0.11145107634365559, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973618745804, "step": 1657 }, { "clip_ratio": 0.0, "completion_length": 507.4375305175781, "epoch": 0.49525800911059664, "grad_norm": 0.7219834923744202, "kl": 0.207275390625, "learning_rate": 1.18971675790128e-05, "loss": 0.0829, "reward": 2.1255581378936768, "reward_std": 0.219564164057374, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9827009290456772, "step": 1658 }, { "clip_ratio": 0.0, "completion_length": 588.8571624755859, "epoch": 0.4955567171981181, "grad_norm": 0.22115330398082733, "kl": 0.13427734375, "learning_rate": 1.1886925719696243e-05, "loss": 0.0531, "reward": 2.0172992050647736, "reward_std": 0.18771815672516823, "rewards/accuracy_reward": 0.037946431431919336, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.992745578289032, "step": 1659 }, { "clip_ratio": 0.0, "completion_length": 553.9085083007812, "epoch": 0.4958554252856396, "grad_norm": 0.14741678535938263, "kl": 0.1385498046875, "learning_rate": 1.1876681807589443e-05, "loss": 0.0412, "reward": 2.090959906578064, "reward_std": 0.17080579325556755, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 1660 }, { "clip_ratio": 0.0, "completion_length": 600.9174346923828, "epoch": 0.49615413337316105, "grad_norm": 0.11724910885095596, "kl": 0.1182861328125, "learning_rate": 1.1866435853836773e-05, "loss": 0.0435, "reward": 2.0786831378936768, "reward_std": 0.18411995843052864, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9893973618745804, "step": 1661 }, { "clip_ratio": 0.0, "completion_length": 627.7299346923828, "epoch": 0.4964528414606825, "grad_norm": 0.1618737429380417, "kl": 0.16943359375, "learning_rate": 1.1856187869584821e-05, "loss": 0.0485, "reward": 2.0965403020381927, "reward_std": 0.21855221316218376, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9827009290456772, "step": 1662 }, { "clip_ratio": 0.0, "completion_length": 575.8638610839844, "epoch": 0.496751549548204, "grad_norm": 0.1640520840883255, "kl": 0.154296875, "learning_rate": 1.1845937865982393e-05, "loss": 0.0426, "reward": 2.056361675262451, "reward_std": 0.18901559337973595, "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.984933078289032, "step": 1663 }, { "clip_ratio": 0.0, "completion_length": 597.5335083007812, "epoch": 0.49705025763572547, "grad_norm": 0.16604147851467133, "kl": 0.1624755859375, "learning_rate": 1.1835685854180489e-05, "loss": 0.0641, "reward": 2.1121652722358704, "reward_std": 0.29086774215102196, "rewards/accuracy_reward": 0.15178572572767735, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9827009290456772, "step": 1664 }, { "clip_ratio": 0.0, "completion_length": 612.0111999511719, "epoch": 0.49734896572324694, "grad_norm": 0.13862192630767822, "kl": 0.2080078125, "learning_rate": 1.1825431845332293e-05, "loss": 0.0263, "reward": 2.055803656578064, "reward_std": 0.20633277297019958, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9888393431901932, "step": 1665 }, { "clip_ratio": 0.0, "completion_length": 607.0290374755859, "epoch": 0.4976476738107684, "grad_norm": 0.13627788424491882, "kl": 0.1455078125, "learning_rate": 1.1815175850593159e-05, "loss": 0.052, "reward": 2.04241082072258, "reward_std": 0.2042703814804554, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9866071790456772, "step": 1666 }, { "clip_ratio": 0.0, "completion_length": 573.8415222167969, "epoch": 0.4979463818982899, "grad_norm": 0.12552829086780548, "kl": 0.099853515625, "learning_rate": 1.1804917881120608e-05, "loss": 0.0171, "reward": 2.1517858505249023, "reward_std": 0.16721350699663162, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.995535746216774, "step": 1667 }, { "clip_ratio": 0.0, "completion_length": 608.4665374755859, "epoch": 0.49824508998581135, "grad_norm": 0.1512712687253952, "kl": 0.134033203125, "learning_rate": 1.1794657948074301e-05, "loss": 0.0261, "reward": 2.0044643878936768, "reward_std": 0.20934216678142548, "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9866071939468384, "step": 1668 }, { "clip_ratio": 0.0, "completion_length": 592.4397583007812, "epoch": 0.4985437980733328, "grad_norm": 0.13076429069042206, "kl": 0.107421875, "learning_rate": 1.1784396062616046e-05, "loss": 0.0315, "reward": 2.0580358505249023, "reward_std": 0.15550674498081207, "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933036267757416, "step": 1669 }, { "clip_ratio": 0.0, "completion_length": 607.3036041259766, "epoch": 0.4988425061608543, "grad_norm": 0.11482016742229462, "kl": 0.1063232421875, "learning_rate": 1.177413223590976e-05, "loss": 0.0479, "reward": 2.088169753551483, "reward_std": 0.1721154972910881, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.987723246216774, "step": 1670 }, { "clip_ratio": 0.0, "completion_length": 583.6741333007812, "epoch": 0.49914121424837576, "grad_norm": 0.132505863904953, "kl": 0.136962890625, "learning_rate": 1.1763866479121486e-05, "loss": 0.0446, "reward": 2.018973231315613, "reward_std": 0.19248790107667446, "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.9821429252624512, "rewards/tag_count_reward": 0.9877232611179352, "step": 1671 }, { "clip_ratio": 0.0, "completion_length": 580.4799346923828, "epoch": 0.49943992233589724, "grad_norm": 0.12019042670726776, "kl": 0.10107421875, "learning_rate": 1.1753598803419361e-05, "loss": 0.0248, "reward": 2.077567011117935, "reward_std": 0.1403685323894024, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.9949776828289032, "step": 1672 }, { "clip_ratio": 0.0, "completion_length": 587.8727874755859, "epoch": 0.4997386304234187, "grad_norm": 0.2362484335899353, "kl": 0.1429443359375, "learning_rate": 1.1743329219973609e-05, "loss": 0.0496, "reward": 2.0820313692092896, "reward_std": 0.1815511342138052, "rewards/accuracy_reward": 0.10267857275903225, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134439468384, "step": 1673 }, { "clip_ratio": 0.0, "completion_length": 590.1897583007812, "epoch": 0.5000373385109402, "grad_norm": 0.11887161433696747, "kl": 0.10693359375, "learning_rate": 1.1733057739956531e-05, "loss": 0.0158, "reward": 2.0753349661827087, "reward_std": 0.14728944934904575, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949777126312256, "step": 1674 }, { "clip_ratio": 0.0, "completion_length": 599.9933319091797, "epoch": 0.5003360465984616, "grad_norm": 0.13958050310611725, "kl": 0.152099609375, "learning_rate": 1.1722784374542489e-05, "loss": 0.0293, "reward": 2.0658483505249023, "reward_std": 0.31295695155858994, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9810268431901932, "step": 1675 }, { "clip_ratio": 0.0, "completion_length": 595.4085235595703, "epoch": 0.5006347546859832, "grad_norm": 0.1804960072040558, "kl": 0.188720703125, "learning_rate": 1.17125091349079e-05, "loss": 0.0813, "reward": 2.08537957072258, "reward_std": 0.2710386961698532, "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9760045111179352, "step": 1676 }, { "clip_ratio": 0.0, "completion_length": 541.4062728881836, "epoch": 0.5009334627735046, "grad_norm": 0.16308261454105377, "kl": 0.1693115234375, "learning_rate": 1.1702232032231213e-05, "loss": 0.0774, "reward": 2.0982143878936768, "reward_std": 0.24360688775777817, "rewards/accuracy_reward": 0.13616072293370962, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9821428954601288, "step": 1677 }, { "clip_ratio": 0.0, "completion_length": 621.0580596923828, "epoch": 0.5012321708610261, "grad_norm": 0.11328668892383575, "kl": 0.10546875, "learning_rate": 1.1691953077692915e-05, "loss": 0.0269, "reward": 2.0279018580913544, "reward_std": 0.09733945969492197, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9944196790456772, "step": 1678 }, { "clip_ratio": 0.0, "completion_length": 559.8772583007812, "epoch": 0.5015308789485475, "grad_norm": 0.09946552664041519, "kl": 0.26708984375, "learning_rate": 1.1681672282475495e-05, "loss": 0.0296, "reward": 2.056361675262451, "reward_std": 0.14103491976857185, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9893973618745804, "step": 1679 }, { "clip_ratio": 0.0, "completion_length": 626.4844055175781, "epoch": 0.501829587036069, "grad_norm": 0.14305105805397034, "kl": 0.1114501953125, "learning_rate": 1.1671389657763457e-05, "loss": 0.0377, "reward": 2.0725446939468384, "reward_std": 0.24170211143791676, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9899553805589676, "step": 1680 }, { "clip_ratio": 0.0, "completion_length": 618.0870819091797, "epoch": 0.5021282951235905, "grad_norm": 0.13746967911720276, "kl": 0.10400390625, "learning_rate": 1.166110521474328e-05, "loss": 0.0387, "reward": 2.1171875596046448, "reward_std": 0.16497142985463142, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9921875298023224, "step": 1681 }, { "clip_ratio": 0.0, "completion_length": 615.888427734375, "epoch": 0.5024270032111119, "grad_norm": 0.5152573585510254, "kl": 0.1951904296875, "learning_rate": 1.1650818964603439e-05, "loss": 0.0421, "reward": 2.0385045409202576, "reward_std": 0.18646783009171486, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9893973618745804, "step": 1682 }, { "clip_ratio": 0.0, "completion_length": 574.1607513427734, "epoch": 0.5027257112986334, "grad_norm": 0.15632525086402893, "kl": 0.1065673828125, "learning_rate": 1.1640530918534361e-05, "loss": 0.0414, "reward": 2.0825894474983215, "reward_std": 0.20473385229706764, "rewards/accuracy_reward": 0.10937500861473382, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9910714626312256, "step": 1683 }, { "clip_ratio": 0.0, "completion_length": 569.8951263427734, "epoch": 0.5030244193861548, "grad_norm": 0.1571969836950302, "kl": 0.1524658203125, "learning_rate": 1.163024108772843e-05, "loss": 0.0359, "reward": 2.2226563692092896, "reward_std": 0.2354949191212654, "rewards/accuracy_reward": 0.247767873108387, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.992745578289032, "step": 1684 }, { "clip_ratio": 0.0, "completion_length": 541.3750228881836, "epoch": 0.5033231274736764, "grad_norm": 0.13366834819316864, "kl": 0.1319580078125, "learning_rate": 1.161994948337998e-05, "loss": 0.0331, "reward": 2.029017925262451, "reward_std": 0.17347734235227108, "rewards/accuracy_reward": 0.05133928940631449, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 1685 }, { "clip_ratio": 0.0, "completion_length": 533.2812805175781, "epoch": 0.5036218355611978, "grad_norm": 0.19976453483104706, "kl": 0.499755859375, "learning_rate": 1.1609656116685265e-05, "loss": 0.0329, "reward": 2.055803656578064, "reward_std": 0.1931649949401617, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.988839328289032, "step": 1686 }, { "clip_ratio": 0.0, "completion_length": 613.7879638671875, "epoch": 0.5039205436487193, "grad_norm": 0.1429137885570526, "kl": 0.1396484375, "learning_rate": 1.1599360998842454e-05, "loss": 0.0278, "reward": 2.0184152126312256, "reward_std": 0.13103245105594397, "rewards/accuracy_reward": 0.037946428870782256, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.991629496216774, "step": 1687 }, { "clip_ratio": 0.0, "completion_length": 524.100471496582, "epoch": 0.5042192517362407, "grad_norm": 0.2828579843044281, "kl": 0.1466064453125, "learning_rate": 1.1589064141051633e-05, "loss": 0.0318, "reward": 2.090959906578064, "reward_std": 0.16977499425411224, "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 1688 }, { "clip_ratio": 0.0, "completion_length": 575.7410888671875, "epoch": 0.5045179598237622, "grad_norm": 0.14978571236133575, "kl": 0.2916259765625, "learning_rate": 1.1578765554514772e-05, "loss": 0.015, "reward": 2.060267984867096, "reward_std": 0.13204334769397974, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 1689 }, { "clip_ratio": 0.0, "completion_length": 577.513427734375, "epoch": 0.5048166679112837, "grad_norm": 0.13006898760795593, "kl": 0.1304931640625, "learning_rate": 1.1568465250435725e-05, "loss": 0.0356, "reward": 2.0831474363803864, "reward_std": 0.16577571630477905, "rewards/accuracy_reward": 0.11383929057046771, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9893973469734192, "step": 1690 }, { "clip_ratio": 0.0, "completion_length": 605.5089416503906, "epoch": 0.5051153759988052, "grad_norm": 0.20680660009384155, "kl": 0.167724609375, "learning_rate": 1.1558163240020209e-05, "loss": 0.0304, "reward": 2.064732253551483, "reward_std": 0.17095475643873215, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.988839328289032, "step": 1691 }, { "clip_ratio": 0.0, "completion_length": 610.9754638671875, "epoch": 0.5054140840863266, "grad_norm": 0.1421739161014557, "kl": 0.176025390625, "learning_rate": 1.1547859534475805e-05, "loss": 0.0502, "reward": 2.0507812798023224, "reward_std": 0.16864940151572227, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9860491454601288, "step": 1692 }, { "clip_ratio": 0.0, "completion_length": 609.3192138671875, "epoch": 0.5057127921738481, "grad_norm": 0.12777331471443176, "kl": 0.1259765625, "learning_rate": 1.1537554145011932e-05, "loss": 0.0225, "reward": 2.001674145460129, "reward_std": 0.13074734713882208, "rewards/accuracy_reward": 0.020089286379516125, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455484867096, "step": 1693 }, { "clip_ratio": 0.0, "completion_length": 529.6875305175781, "epoch": 0.5060115002613695, "grad_norm": 0.12633243203163147, "kl": 0.0853271484375, "learning_rate": 1.152724708283985e-05, "loss": 0.0069, "reward": 2.2315849661827087, "reward_std": 0.11124755907803774, "rewards/accuracy_reward": 0.2343750074505806, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 1694 }, { "clip_ratio": 0.0, "completion_length": 577.5826263427734, "epoch": 0.5063102083488911, "grad_norm": 0.11384643614292145, "kl": 0.09619140625, "learning_rate": 1.1516938359172624e-05, "loss": 0.0037, "reward": 2.0864956974983215, "reward_std": 0.10071282181888819, "rewards/accuracy_reward": 0.08928572060540318, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 1695 }, { "clip_ratio": 0.0, "completion_length": 575.9308319091797, "epoch": 0.5066089164364125, "grad_norm": 0.15631915628910065, "kl": 0.1783447265625, "learning_rate": 1.150662798522514e-05, "loss": 0.0542, "reward": 2.0312501192092896, "reward_std": 0.2046079784631729, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9799107611179352, "step": 1696 }, { "clip_ratio": 0.0, "completion_length": 553.1384124755859, "epoch": 0.506907624523934, "grad_norm": 0.1679946780204773, "kl": 0.17529296875, "learning_rate": 1.1496315972214076e-05, "loss": 0.0465, "reward": 2.0848215222358704, "reward_std": 0.15633169189095497, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.988839328289032, "step": 1697 }, { "clip_ratio": 0.0, "completion_length": 569.5982513427734, "epoch": 0.5072063326114554, "grad_norm": 0.12386886030435562, "kl": 0.1253662109375, "learning_rate": 1.1486002331357887e-05, "loss": 0.0081, "reward": 2.095982253551483, "reward_std": 0.10948435962200165, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 1698 }, { "clip_ratio": 0.0, "completion_length": 590.1875457763672, "epoch": 0.507505040698977, "grad_norm": 0.5788133144378662, "kl": 0.3106689453125, "learning_rate": 1.1475687073876806e-05, "loss": 0.0768, "reward": 2.0334822237491608, "reward_std": 0.2273225635290146, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9843750447034836, "step": 1699 }, { "clip_ratio": 0.0, "completion_length": 577.9218902587891, "epoch": 0.5078037487864984, "grad_norm": 0.25517240166664124, "kl": 0.238525390625, "learning_rate": 1.146537021099282e-05, "loss": 0.078, "reward": 2.033482253551483, "reward_std": 0.2706870920956135, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9754464775323868, "step": 1700 }, { "clip_ratio": 0.0, "completion_length": 602.3995666503906, "epoch": 0.5081024568740199, "grad_norm": 0.18096943199634552, "kl": 0.1448974609375, "learning_rate": 1.1455051753929668e-05, "loss": 0.0587, "reward": 2.107701063156128, "reward_std": 0.21832131687551737, "rewards/accuracy_reward": 0.13616072130389512, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9871652126312256, "step": 1701 }, { "clip_ratio": 0.0, "completion_length": 546.138427734375, "epoch": 0.5084011649615413, "grad_norm": 0.20271003246307373, "kl": 0.3309326171875, "learning_rate": 1.1444731713912818e-05, "loss": 0.0196, "reward": 2.033482253551483, "reward_std": 0.1458922689780593, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714626312256, "step": 1702 }, { "clip_ratio": 0.0, "completion_length": 597.2701110839844, "epoch": 0.5086998730490628, "grad_norm": 0.15407605469226837, "kl": 0.12890625, "learning_rate": 1.1434410102169462e-05, "loss": 0.0388, "reward": 2.1383929550647736, "reward_std": 0.19024229422211647, "rewards/accuracy_reward": 0.17187500488944352, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9888393133878708, "step": 1703 }, { "clip_ratio": 0.0, "completion_length": 540.2120742797852, "epoch": 0.5089985811365842, "grad_norm": 0.14603300392627716, "kl": 0.2373046875, "learning_rate": 1.1424086929928502e-05, "loss": 0.015, "reward": 2.0892857909202576, "reward_std": 0.11902236379683018, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1704 }, { "clip_ratio": 0.0, "completion_length": 627.5625305175781, "epoch": 0.5092972892241058, "grad_norm": 0.22580790519714355, "kl": 0.255615234375, "learning_rate": 1.1413762208420536e-05, "loss": 0.0324, "reward": 2.0691965520381927, "reward_std": 0.18919116258621216, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9866071790456772, "step": 1705 }, { "clip_ratio": 0.0, "completion_length": 570.678596496582, "epoch": 0.5095959973116272, "grad_norm": 0.14426539838314056, "kl": 0.1353759765625, "learning_rate": 1.1403435948877855e-05, "loss": 0.0418, "reward": 2.0496652722358704, "reward_std": 0.21853198111057281, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652126312256, "step": 1706 }, { "clip_ratio": 0.0, "completion_length": 586.9754943847656, "epoch": 0.5098947053991487, "grad_norm": 0.109906867146492, "kl": 0.126953125, "learning_rate": 1.139310816253441e-05, "loss": 0.0314, "reward": 2.0658482909202576, "reward_std": 0.12062234617769718, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9921875298023224, "step": 1707 }, { "clip_ratio": 0.0, "completion_length": 590.5089569091797, "epoch": 0.5101934134866701, "grad_norm": 1.1910521984100342, "kl": 0.33740234375, "learning_rate": 1.1382778860625826e-05, "loss": 0.0794, "reward": 2.0943081080913544, "reward_std": 0.2489062286913395, "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9827009290456772, "step": 1708 }, { "clip_ratio": 0.0, "completion_length": 601.9397583007812, "epoch": 0.5104921215741917, "grad_norm": 0.13699652254581451, "kl": 0.1011962890625, "learning_rate": 1.1372448054389364e-05, "loss": 0.0209, "reward": 2.1255581378936768, "reward_std": 0.19734062254428864, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.991629496216774, "step": 1709 }, { "clip_ratio": 0.0, "completion_length": 580.5268249511719, "epoch": 0.5107908296617131, "grad_norm": 0.14559362828731537, "kl": 0.125244140625, "learning_rate": 1.1362115755063936e-05, "loss": 0.0126, "reward": 2.1132812798023224, "reward_std": 0.11112472228705883, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949777126312256, "step": 1710 }, { "clip_ratio": 0.0, "completion_length": 550.2745895385742, "epoch": 0.5110895377492346, "grad_norm": 0.125139519572258, "kl": 0.0987548828125, "learning_rate": 1.1351781973890068e-05, "loss": 0.0123, "reward": 2.1127232909202576, "reward_std": 0.1211213544011116, "rewards/accuracy_reward": 0.12276786053553224, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.996651828289032, "step": 1711 }, { "clip_ratio": 0.0, "completion_length": 608.3326110839844, "epoch": 0.511388245836756, "grad_norm": 0.15754060447216034, "kl": 0.171875, "learning_rate": 1.1341446722109901e-05, "loss": 0.0442, "reward": 2.091517925262451, "reward_std": 0.22339611500501633, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9843750447034836, "step": 1712 }, { "clip_ratio": 0.0, "completion_length": 557.5714569091797, "epoch": 0.5116869539242775, "grad_norm": 0.1216995120048523, "kl": 0.114990234375, "learning_rate": 1.1331110010967177e-05, "loss": 0.031, "reward": 2.0848215222358704, "reward_std": 0.15029465965926647, "rewards/accuracy_reward": 0.10267858020961285, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933036118745804, "step": 1713 }, { "clip_ratio": 0.0, "completion_length": 601.8571624755859, "epoch": 0.511985662011799, "grad_norm": 3.6667375564575195, "kl": 0.31201171875, "learning_rate": 1.1320771851707225e-05, "loss": 0.064, "reward": 2.05803582072258, "reward_std": 0.18562186881899834, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9843750298023224, "step": 1714 }, { "clip_ratio": 0.0, "completion_length": 593.7544860839844, "epoch": 0.5122843700993205, "grad_norm": 0.12474174052476883, "kl": 0.099853515625, "learning_rate": 1.1310432255576944e-05, "loss": 0.0214, "reward": 2.1445313692092896, "reward_std": 0.1616159789264202, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 1715 }, { "clip_ratio": 0.0, "completion_length": 562.412971496582, "epoch": 0.5125830781868419, "grad_norm": 0.15526098012924194, "kl": 0.101806640625, "learning_rate": 1.1300091233824806e-05, "loss": 0.0288, "reward": 2.1082590520381927, "reward_std": 0.19191144034266472, "rewards/accuracy_reward": 0.12500001047737896, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9944196790456772, "step": 1716 }, { "clip_ratio": 0.0, "completion_length": 612.1406555175781, "epoch": 0.5128817862743634, "grad_norm": 0.2735785245895386, "kl": 0.169677734375, "learning_rate": 1.128974879770083e-05, "loss": 0.0609, "reward": 2.076451003551483, "reward_std": 0.2532893233001232, "rewards/accuracy_reward": 0.12276786426082253, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 1717 }, { "clip_ratio": 0.0, "completion_length": 595.9464416503906, "epoch": 0.5131804943618848, "grad_norm": 0.24443437159061432, "kl": 0.173095703125, "learning_rate": 1.1279404958456572e-05, "loss": 0.0459, "reward": 2.102678656578064, "reward_std": 0.19233887642621994, "rewards/accuracy_reward": 0.13169643189758062, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9866071939468384, "step": 1718 }, { "clip_ratio": 0.0, "completion_length": 549.8750152587891, "epoch": 0.5134792024494064, "grad_norm": 0.14487919211387634, "kl": 0.116455078125, "learning_rate": 1.1269059727345111e-05, "loss": 0.0265, "reward": 2.1668527722358704, "reward_std": 0.1739705353975296, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 1719 }, { "clip_ratio": 0.0, "completion_length": 595.6071624755859, "epoch": 0.5137779105369278, "grad_norm": 0.32402142882347107, "kl": 0.223876953125, "learning_rate": 1.1258713115621051e-05, "loss": 0.0425, "reward": 2.0691965222358704, "reward_std": 0.26633910089731216, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9866071939468384, "step": 1720 }, { "clip_ratio": 0.0, "completion_length": 588.2567291259766, "epoch": 0.5140766186244493, "grad_norm": 0.14065758883953094, "kl": 0.2041015625, "learning_rate": 1.1248365134540489e-05, "loss": 0.0535, "reward": 2.0390625596046448, "reward_std": 0.16482791677117348, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9899553954601288, "step": 1721 }, { "clip_ratio": 0.0, "completion_length": 608.4241333007812, "epoch": 0.5143753267119707, "grad_norm": 0.10956374555826187, "kl": 0.1036376953125, "learning_rate": 1.1238015795361011e-05, "loss": 0.0209, "reward": 2.0535714626312256, "reward_std": 0.11821345239877701, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.995535746216774, "step": 1722 }, { "clip_ratio": 0.0, "completion_length": 588.6919860839844, "epoch": 0.5146740347994921, "grad_norm": 0.10329099744558334, "kl": 0.1258544921875, "learning_rate": 1.1227665109341686e-05, "loss": 0.0146, "reward": 2.0887277722358704, "reward_std": 0.10918493196368217, "rewards/accuracy_reward": 0.10044643236324191, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949777126312256, "step": 1723 }, { "clip_ratio": 0.0, "completion_length": 622.0067138671875, "epoch": 0.5149727428870137, "grad_norm": 0.13919955492019653, "kl": 0.138427734375, "learning_rate": 1.1217313087743048e-05, "loss": 0.0188, "reward": 2.0770090222358704, "reward_std": 0.16058618761599064, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553805589676, "step": 1724 }, { "clip_ratio": 0.0, "completion_length": 630.3125305175781, "epoch": 0.5152714509745351, "grad_norm": 0.18964217603206635, "kl": 0.1871337890625, "learning_rate": 1.1206959741827079e-05, "loss": 0.0659, "reward": 2.0295759439468384, "reward_std": 0.21110327914357185, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9849330633878708, "step": 1725 }, { "clip_ratio": 0.0, "completion_length": 567.6718902587891, "epoch": 0.5155701590620566, "grad_norm": 0.16361823678016663, "kl": 0.138916015625, "learning_rate": 1.1196605082857204e-05, "loss": 0.0331, "reward": 2.041852831840515, "reward_std": 0.20327529683709145, "rewards/accuracy_reward": 0.06026785960420966, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.992745578289032, "step": 1726 }, { "clip_ratio": 0.0, "completion_length": 595.7768249511719, "epoch": 0.515868867149578, "grad_norm": 0.17833387851715088, "kl": 0.1917724609375, "learning_rate": 1.1186249122098282e-05, "loss": 0.0458, "reward": 2.032366156578064, "reward_std": 0.13650763779878616, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9921875298023224, "step": 1727 }, { "clip_ratio": 0.0, "completion_length": 586.6719055175781, "epoch": 0.5161675752370996, "grad_norm": 1.2958893775939941, "kl": 0.279296875, "learning_rate": 1.117589187081658e-05, "loss": 0.0542, "reward": 2.157924175262451, "reward_std": 0.16469615139067173, "rewards/accuracy_reward": 0.17410715157166123, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949777275323868, "step": 1728 }, { "clip_ratio": 0.0, "completion_length": 594.4933319091797, "epoch": 0.516466283324621, "grad_norm": 0.9346919655799866, "kl": 0.326904296875, "learning_rate": 1.1165533340279771e-05, "loss": 0.0343, "reward": 2.095982253551483, "reward_std": 0.14079732913523912, "rewards/accuracy_reward": 0.10937500675208867, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1729 }, { "clip_ratio": 0.0, "completion_length": 565.6518096923828, "epoch": 0.5167649914121425, "grad_norm": 2.5288870334625244, "kl": 0.4385986328125, "learning_rate": 1.115517354175692e-05, "loss": 0.0309, "reward": 2.208705425262451, "reward_std": 0.1501120962202549, "rewards/accuracy_reward": 0.2299107313156128, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.994419664144516, "step": 1730 }, { "clip_ratio": 0.0, "completion_length": 599.6875152587891, "epoch": 0.5170636994996639, "grad_norm": 5.103280544281006, "kl": 0.31494140625, "learning_rate": 1.1144812486518478e-05, "loss": 0.0568, "reward": 2.0150670409202576, "reward_std": 0.16874276287853718, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9838170111179352, "step": 1731 }, { "clip_ratio": 0.0, "completion_length": 621.6026992797852, "epoch": 0.5173624075871854, "grad_norm": 4.025388717651367, "kl": 0.1290283203125, "learning_rate": 1.1134450185836254e-05, "loss": 0.0337, "reward": 2.1004465222358704, "reward_std": 0.17698492296040058, "rewards/accuracy_reward": 0.13169643771834671, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9888393133878708, "step": 1732 }, { "clip_ratio": 0.0, "completion_length": 577.5602874755859, "epoch": 0.5176611156747069, "grad_norm": 5.642408847808838, "kl": 0.22705078125, "learning_rate": 1.1124086650983415e-05, "loss": 0.0569, "reward": 2.0970982909202576, "reward_std": 0.17158551327884197, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9899553954601288, "step": 1733 }, { "clip_ratio": 0.0, "completion_length": 584.8661041259766, "epoch": 0.5179598237622284, "grad_norm": 1.1211895942687988, "kl": 0.2626953125, "learning_rate": 1.1113721893234472e-05, "loss": 0.0286, "reward": 2.1445313096046448, "reward_std": 0.10498691257089376, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949777126312256, "step": 1734 }, { "clip_ratio": 0.0, "completion_length": 572.7835083007812, "epoch": 0.5182585318497498, "grad_norm": 0.34948208928108215, "kl": 0.1768798828125, "learning_rate": 1.1103355923865266e-05, "loss": 0.008, "reward": 2.1093751192092896, "reward_std": 0.07262345030903816, "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 1735 }, { "clip_ratio": 0.0, "completion_length": 589.4375152587891, "epoch": 0.5185572399372713, "grad_norm": 17.552671432495117, "kl": 1.7513427734375, "learning_rate": 1.1092988754152956e-05, "loss": 0.1385, "reward": 2.087611675262451, "reward_std": 0.21882518380880356, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.991629496216774, "step": 1736 }, { "clip_ratio": 0.0, "completion_length": 584.4687805175781, "epoch": 0.5188559480247927, "grad_norm": 0.8136669993400574, "kl": 1.0440673828125, "learning_rate": 1.1082620395376006e-05, "loss": 0.0596, "reward": 2.002232253551483, "reward_std": 0.17658159974962473, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9843750447034836, "step": 1737 }, { "clip_ratio": 0.0, "completion_length": 601.5937805175781, "epoch": 0.5191546561123143, "grad_norm": 1.158494472503662, "kl": 0.76806640625, "learning_rate": 1.1072250858814173e-05, "loss": 0.0858, "reward": 2.0306920409202576, "reward_std": 0.21593026258051395, "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9882812947034836, "step": 1738 }, { "clip_ratio": 0.0, "completion_length": 575.7455596923828, "epoch": 0.5194533641998357, "grad_norm": 2.2529661655426025, "kl": 0.2763671875, "learning_rate": 1.1061880155748497e-05, "loss": 0.0373, "reward": 2.0853795409202576, "reward_std": 0.06978206802159548, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 1739 }, { "clip_ratio": 0.0, "completion_length": 570.0201263427734, "epoch": 0.5197520722873572, "grad_norm": 2.6795272827148438, "kl": 0.4482421875, "learning_rate": 1.1051508297461286e-05, "loss": 0.0678, "reward": 2.0848215520381927, "reward_std": 0.16049314849078655, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.988839328289032, "step": 1740 }, { "clip_ratio": 0.0, "completion_length": 567.2678680419922, "epoch": 0.5200507803748786, "grad_norm": 2.3234739303588867, "kl": 0.8955078125, "learning_rate": 1.104113529523611e-05, "loss": 0.0785, "reward": 2.1015625596046448, "reward_std": 0.17946169339120388, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9877232611179352, "step": 1741 }, { "clip_ratio": 0.0, "completion_length": 559.8303833007812, "epoch": 0.5203494884624001, "grad_norm": 1.4918020963668823, "kl": 0.3837890625, "learning_rate": 1.1030761160357773e-05, "loss": 0.0494, "reward": 2.0518974363803864, "reward_std": 0.1366214770823717, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616454601288, "step": 1742 }, { "clip_ratio": 0.0, "completion_length": 574.4040374755859, "epoch": 0.5206481965499216, "grad_norm": 1.7624788284301758, "kl": 0.487548828125, "learning_rate": 1.1020385904112318e-05, "loss": 0.0783, "reward": 2.021205484867096, "reward_std": 0.13255967013537884, "rewards/accuracy_reward": 0.042410715483129025, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875447034836, "step": 1743 }, { "clip_ratio": 0.0, "completion_length": 590.1986846923828, "epoch": 0.5209469046374431, "grad_norm": 0.3062087893486023, "kl": 0.27587890625, "learning_rate": 1.101000953778701e-05, "loss": 0.029, "reward": 2.1049107909202576, "reward_std": 0.15401234664022923, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.995535746216774, "step": 1744 }, { "clip_ratio": 0.0, "completion_length": 606.1451110839844, "epoch": 0.5212456127249645, "grad_norm": 0.9999996423721313, "kl": 0.224609375, "learning_rate": 1.0999632072670314e-05, "loss": 0.026, "reward": 2.064174175262451, "reward_std": 0.15831842459738255, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 1745 }, { "clip_ratio": 0.0, "completion_length": 569.1004867553711, "epoch": 0.521544320812486, "grad_norm": 2.336604595184326, "kl": 0.42431640625, "learning_rate": 1.0989253520051898e-05, "loss": 0.0574, "reward": 2.1489956080913544, "reward_std": 0.20979502610862255, "rewards/accuracy_reward": 0.18526786309666932, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9838170111179352, "step": 1746 }, { "clip_ratio": 0.0, "completion_length": 574.325927734375, "epoch": 0.5218430289000074, "grad_norm": 0.5189833045005798, "kl": 0.3828125, "learning_rate": 1.097887389122261e-05, "loss": 0.078, "reward": 2.046875089406967, "reward_std": 0.1694694235920906, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9866071790456772, "step": 1747 }, { "clip_ratio": 0.0, "completion_length": 588.1741180419922, "epoch": 0.522141736987529, "grad_norm": 0.21277056634426117, "kl": 0.3524169921875, "learning_rate": 1.0968493197474469e-05, "loss": 0.0149, "reward": 2.0608260333538055, "reward_std": 0.17548557929694653, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.99386166036129, "step": 1748 }, { "clip_ratio": 0.0, "completion_length": 569.0915451049805, "epoch": 0.5224404450750504, "grad_norm": 0.6612910032272339, "kl": 0.3656005859375, "learning_rate": 1.095811145010065e-05, "loss": 0.0931, "reward": 2.078125089406967, "reward_std": 0.2085141558200121, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9866071790456772, "step": 1749 }, { "clip_ratio": 0.0, "completion_length": 517.4464492797852, "epoch": 0.5227391531625719, "grad_norm": 0.2041643261909485, "kl": 0.10888671875, "learning_rate": 1.094772866039548e-05, "loss": 0.0082, "reward": 2.0864956974983215, "reward_std": 0.16439768858253956, "rewards/accuracy_reward": 0.09375000325962901, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9994419813156128, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 574.3147659301758, "epoch": 0.5230378612500933, "grad_norm": 0.40833550691604614, "kl": 0.2476806640625, "learning_rate": 1.0937344839654416e-05, "loss": 0.0444, "reward": 2.017857253551483, "reward_std": 0.13668943010270596, "rewards/accuracy_reward": 0.04017857392318547, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 1751 }, { "clip_ratio": 0.0, "completion_length": 591.6160888671875, "epoch": 0.5233365693376149, "grad_norm": 0.7162529826164246, "kl": 0.289794921875, "learning_rate": 1.0926959999174032e-05, "loss": 0.0285, "reward": 2.1015626192092896, "reward_std": 0.19559410400688648, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9921875298023224, "step": 1752 }, { "clip_ratio": 0.0, "completion_length": 532.678596496582, "epoch": 0.5236352774251363, "grad_norm": 0.2735196053981781, "kl": 0.1787109375, "learning_rate": 1.0916574150252024e-05, "loss": 0.0135, "reward": 2.1646206080913544, "reward_std": 0.19466150179505348, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 1753 }, { "clip_ratio": 0.0, "completion_length": 549.2790374755859, "epoch": 0.5239339855126578, "grad_norm": 1.1009521484375, "kl": 0.508544921875, "learning_rate": 1.0906187304187175e-05, "loss": 0.034, "reward": 2.0468751788139343, "reward_std": 0.21658143773674965, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9866071939468384, "step": 1754 }, { "clip_ratio": 0.0, "completion_length": 577.0000305175781, "epoch": 0.5242326936001792, "grad_norm": 1.9244847297668457, "kl": 0.512451171875, "learning_rate": 1.0895799472279351e-05, "loss": 0.0712, "reward": 2.087053656578064, "reward_std": 0.2080293893814087, "rewards/accuracy_reward": 0.11383929010480642, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9888393133878708, "step": 1755 }, { "clip_ratio": 0.0, "completion_length": 576.3415374755859, "epoch": 0.5245314016877007, "grad_norm": 0.4494386315345764, "kl": 0.4482421875, "learning_rate": 1.0885410665829503e-05, "loss": 0.0732, "reward": 2.0217635333538055, "reward_std": 0.1687302216887474, "rewards/accuracy_reward": 0.058035715483129025, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9860491454601288, "step": 1756 }, { "clip_ratio": 0.0, "completion_length": 571.0178985595703, "epoch": 0.5248301097752222, "grad_norm": 0.8235184550285339, "kl": 0.436767578125, "learning_rate": 1.087502089613963e-05, "loss": 0.0306, "reward": 2.139509081840515, "reward_std": 0.2327747829258442, "rewards/accuracy_reward": 0.16294643469154835, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9899553954601288, "step": 1757 }, { "clip_ratio": 0.0, "completion_length": 570.0223388671875, "epoch": 0.5251288178627437, "grad_norm": 0.551304817199707, "kl": 0.2884521484375, "learning_rate": 1.0864630174512783e-05, "loss": 0.021, "reward": 2.1400670409202576, "reward_std": 0.22031569480895996, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.990513414144516, "step": 1758 }, { "clip_ratio": 0.0, "completion_length": 572.6205596923828, "epoch": 0.5254275259502651, "grad_norm": 0.9589059948921204, "kl": 0.2135009765625, "learning_rate": 1.0854238512253045e-05, "loss": 0.0453, "reward": 2.095424175262451, "reward_std": 0.19847392663359642, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.992745578289032, "step": 1759 }, { "clip_ratio": 0.0, "completion_length": 558.5491333007812, "epoch": 0.5257262340377866, "grad_norm": 0.19662509858608246, "kl": 0.1676025390625, "learning_rate": 1.0843845920665534e-05, "loss": 0.0202, "reward": 2.137834906578064, "reward_std": 0.13065248727798462, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 1760 }, { "clip_ratio": 0.0, "completion_length": 596.6339569091797, "epoch": 0.526024942125308, "grad_norm": 1.9210340976715088, "kl": 0.9080810546875, "learning_rate": 1.0833452411056366e-05, "loss": 0.0841, "reward": 2.0312501192092896, "reward_std": 0.1913491114974022, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9843750596046448, "step": 1761 }, { "clip_ratio": 0.0, "completion_length": 559.5290451049805, "epoch": 0.5263236502128296, "grad_norm": 2.6422266960144043, "kl": 0.896484375, "learning_rate": 1.0823057994732661e-05, "loss": 0.1492, "reward": 2.1244420409202576, "reward_std": 0.22926399856805801, "rewards/accuracy_reward": 0.16741072200238705, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9815848618745804, "step": 1762 }, { "clip_ratio": 0.0, "completion_length": 582.4263763427734, "epoch": 0.526622358300351, "grad_norm": 0.274692565202713, "kl": 0.206298828125, "learning_rate": 1.0812662683002528e-05, "loss": 0.0208, "reward": 2.0770090222358704, "reward_std": 0.15581971779465675, "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9921875298023224, "step": 1763 }, { "clip_ratio": 0.0, "completion_length": 531.4442291259766, "epoch": 0.5269210663878725, "grad_norm": 1.0912342071533203, "kl": 0.201171875, "learning_rate": 1.0802266487175044e-05, "loss": 0.0107, "reward": 2.050781339406967, "reward_std": 0.08425757940858603, "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.997209832072258, "step": 1764 }, { "clip_ratio": 0.0, "completion_length": 580.1786041259766, "epoch": 0.5272197744753939, "grad_norm": 1.1280404329299927, "kl": 0.5146484375, "learning_rate": 1.0791869418560254e-05, "loss": 0.1092, "reward": 1.9754465222358704, "reward_std": 0.34118978679180145, "rewards/accuracy_reward": 0.07366071944124997, "rewards/format_reward": 0.9285714775323868, "rewards/tag_count_reward": 0.973214328289032, "step": 1765 }, { "clip_ratio": 0.0, "completion_length": 862.0982666015625, "epoch": 0.5275184825629153, "grad_norm": 44.80141067504883, "kl": 10.453125, "learning_rate": 1.0781471488469146e-05, "loss": 0.5387, "reward": 0.9202009290456772, "reward_std": 0.44758106768131256, "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.13616072200238705, "rewards/tag_count_reward": 0.7416294813156128, "step": 1766 }, { "clip_ratio": 0.0, "completion_length": 839.1540679931641, "epoch": 0.5278171906504369, "grad_norm": 50.694129943847656, "kl": 11.703125, "learning_rate": 1.0771072708213652e-05, "loss": 0.5755, "reward": 0.8822545260190964, "reward_std": 0.38517846912145615, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0959821455180645, "rewards/tag_count_reward": 0.7148437798023224, "step": 1767 }, { "clip_ratio": 0.0, "completion_length": 780.919677734375, "epoch": 0.5281158987379583, "grad_norm": 35.13034439086914, "kl": 9.0234375, "learning_rate": 1.0760673089106626e-05, "loss": 0.5978, "reward": 0.9447545260190964, "reward_std": 0.48432382196187973, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.1875000111758709, "rewards/tag_count_reward": 0.757254496216774, "step": 1768 }, { "clip_ratio": 0.0, "completion_length": 602.5357360839844, "epoch": 0.5284146068254798, "grad_norm": 9.021455764770508, "kl": 1.861328125, "learning_rate": 1.075027264246183e-05, "loss": 0.3535, "reward": 1.6757813096046448, "reward_std": 0.639447882771492, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.667410746216774, "rewards/tag_count_reward": 0.9101562947034836, "step": 1769 }, { "clip_ratio": 0.0, "completion_length": 490.05359649658203, "epoch": 0.5287133149130012, "grad_norm": 6.2164106369018555, "kl": 0.206298828125, "learning_rate": 1.0739871379593935e-05, "loss": 0.0725, "reward": 2.0842634737491608, "reward_std": 0.19489147514104843, "rewards/accuracy_reward": 0.11160715040750802, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9905134290456772, "step": 1770 }, { "clip_ratio": 0.0, "completion_length": 472.41967010498047, "epoch": 0.5290120230005227, "grad_norm": 0.649368941783905, "kl": 0.16259765625, "learning_rate": 1.0729469311818496e-05, "loss": 0.0216, "reward": 2.1523438692092896, "reward_std": 0.0842773811891675, "rewards/accuracy_reward": 0.1629464328289032, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.9960937649011612, "step": 1771 }, { "clip_ratio": 0.0, "completion_length": 509.8951110839844, "epoch": 0.5293107310880442, "grad_norm": 4.656844615936279, "kl": 0.314453125, "learning_rate": 1.0719066450451943e-05, "loss": 0.0477, "reward": 2.1489956378936768, "reward_std": 0.17332126945257187, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.992745578289032, "step": 1772 }, { "clip_ratio": 0.0, "completion_length": 488.40850830078125, "epoch": 0.5296094391755657, "grad_norm": 3.676971197128296, "kl": 0.5242919921875, "learning_rate": 1.0708662806811563e-05, "loss": 0.0899, "reward": 2.1277902722358704, "reward_std": 0.20880644023418427, "rewards/accuracy_reward": 0.15625000232830644, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973469734192, "step": 1773 }, { "clip_ratio": 0.0, "completion_length": 499.8817138671875, "epoch": 0.5299081472630871, "grad_norm": 1.6373193264007568, "kl": 0.8076171875, "learning_rate": 1.0698258392215508e-05, "loss": 0.0981, "reward": 2.0496652722358704, "reward_std": 0.1779013853520155, "rewards/accuracy_reward": 0.08035714365541935, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9871652275323868, "step": 1774 }, { "clip_ratio": 0.0, "completion_length": 501.61163330078125, "epoch": 0.5302068553506086, "grad_norm": 0.3398004174232483, "kl": 0.373779296875, "learning_rate": 1.068785321798276e-05, "loss": 0.0222, "reward": 2.1021206378936768, "reward_std": 0.13829956762492657, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9949776977300644, "step": 1775 }, { "clip_ratio": 0.0, "completion_length": 506.3192138671875, "epoch": 0.53050556343813, "grad_norm": 1.983547329902649, "kl": 0.5489501953125, "learning_rate": 1.0677447295433122e-05, "loss": 0.0661, "reward": 2.088169753551483, "reward_std": 0.09939506207592785, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9944196790456772, "step": 1776 }, { "clip_ratio": 0.0, "completion_length": 499.22100830078125, "epoch": 0.5308042715256516, "grad_norm": 4.141530990600586, "kl": 0.8868408203125, "learning_rate": 1.0667040635887231e-05, "loss": 0.0922, "reward": 2.1088170409202576, "reward_std": 0.15524422843009233, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9927455484867096, "step": 1777 }, { "clip_ratio": 0.0, "completion_length": 492.2745895385742, "epoch": 0.531102979613173, "grad_norm": 1.5448088645935059, "kl": 0.8782958984375, "learning_rate": 1.0656633250666501e-05, "loss": 0.0775, "reward": 2.165736675262451, "reward_std": 0.21122610941529274, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.991629496216774, "step": 1778 }, { "clip_ratio": 0.0, "completion_length": 510.3437728881836, "epoch": 0.5314016877006945, "grad_norm": 0.978751003742218, "kl": 0.69384765625, "learning_rate": 1.0646225151093154e-05, "loss": 0.0476, "reward": 2.1177456974983215, "reward_std": 0.14848867151886225, "rewards/accuracy_reward": 0.13839286798611283, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 1779 }, { "clip_ratio": 0.0, "completion_length": 498.37056732177734, "epoch": 0.5317003957882159, "grad_norm": 1.0197558403015137, "kl": 0.4132080078125, "learning_rate": 1.0635816348490176e-05, "loss": 0.0207, "reward": 2.0401787161827087, "reward_std": 0.1443783575668931, "rewards/accuracy_reward": 0.05357143050059676, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1780 }, { "clip_ratio": 0.0, "completion_length": 518.4710006713867, "epoch": 0.5319991038757375, "grad_norm": 0.819713830947876, "kl": 0.22802734375, "learning_rate": 1.062540685418133e-05, "loss": 0.0249, "reward": 2.0848215222358704, "reward_std": 0.17128508910536766, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.9821429252624512, "rewards/tag_count_reward": 0.995535746216774, "step": 1781 }, { "clip_ratio": 0.0, "completion_length": 504.24778747558594, "epoch": 0.5322978119632589, "grad_norm": 1.4165726900100708, "kl": 0.1663818359375, "learning_rate": 1.0614996679491123e-05, "loss": -0.0034, "reward": 2.1434152722358704, "reward_std": 0.16082165855914354, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9938616305589676, "step": 1782 }, { "clip_ratio": 0.0, "completion_length": 521.8102874755859, "epoch": 0.5325965200507804, "grad_norm": 3.0771703720092773, "kl": 0.656005859375, "learning_rate": 1.0604585835744802e-05, "loss": 0.0829, "reward": 2.0245537161827087, "reward_std": 0.169528566300869, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.988839328289032, "step": 1783 }, { "clip_ratio": 0.0, "completion_length": 572.0268096923828, "epoch": 0.5328952281383018, "grad_norm": 0.9403454065322876, "kl": 0.9180908203125, "learning_rate": 1.0594174334268352e-05, "loss": 0.0623, "reward": 2.0491071939468384, "reward_std": 0.14040336851030588, "rewards/accuracy_reward": 0.0736607180442661, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9888393133878708, "step": 1784 }, { "clip_ratio": 0.0, "completion_length": 532.944221496582, "epoch": 0.5331939362258233, "grad_norm": 3.37717342376709, "kl": 0.96484375, "learning_rate": 1.058376218638846e-05, "loss": 0.053, "reward": 2.1992188692092896, "reward_std": 0.16916834190487862, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 1785 }, { "clip_ratio": 0.0, "completion_length": 524.2544937133789, "epoch": 0.5334926443133448, "grad_norm": 4.837312698364258, "kl": 1.0914306640625, "learning_rate": 1.0573349403432524e-05, "loss": 0.1342, "reward": 2.071986734867096, "reward_std": 0.1526624606922269, "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973618745804, "step": 1786 }, { "clip_ratio": 0.0, "completion_length": 547.1205596923828, "epoch": 0.5337913524008663, "grad_norm": 1.3670111894607544, "kl": 0.5830078125, "learning_rate": 1.0562935996728629e-05, "loss": 0.0209, "reward": 2.166294753551483, "reward_std": 0.14741660468280315, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9966518133878708, "step": 1787 }, { "clip_ratio": 0.0, "completion_length": 601.9174499511719, "epoch": 0.5340900604883877, "grad_norm": 0.7583140134811401, "kl": 0.319091796875, "learning_rate": 1.0552521977605546e-05, "loss": 0.0103, "reward": 2.099330425262451, "reward_std": 0.11837773956358433, "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875298023224, "step": 1788 }, { "clip_ratio": 0.0, "completion_length": 549.1428756713867, "epoch": 0.5343887685759092, "grad_norm": 1.021346092224121, "kl": 0.2952880859375, "learning_rate": 1.0542107357392704e-05, "loss": 0.0459, "reward": 2.134486675262451, "reward_std": 0.1901172362267971, "rewards/accuracy_reward": 0.16071429662406445, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9916295111179352, "step": 1789 }, { "clip_ratio": 0.0, "completion_length": 570.138427734375, "epoch": 0.5346874766634306, "grad_norm": 0.2220473289489746, "kl": 0.0977783203125, "learning_rate": 1.0531692147420187e-05, "loss": 0.0077, "reward": 2.1082590222358704, "reward_std": 0.10961449053138494, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9988839328289032, "step": 1790 }, { "clip_ratio": 0.0, "completion_length": 570.7411041259766, "epoch": 0.5349861847509522, "grad_norm": 1.6831843852996826, "kl": 0.3470458984375, "learning_rate": 1.0521276359018728e-05, "loss": 0.0305, "reward": 2.008370667695999, "reward_std": 0.11505272705107927, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.990513414144516, "step": 1791 }, { "clip_ratio": 0.0, "completion_length": 578.794677734375, "epoch": 0.5352848928384736, "grad_norm": 3.040095806121826, "kl": 0.7340087890625, "learning_rate": 1.0510860003519681e-05, "loss": 0.0983, "reward": 2.0418528020381927, "reward_std": 0.19477375969290733, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9927455633878708, "step": 1792 }, { "clip_ratio": 0.0, "completion_length": 599.3817291259766, "epoch": 0.5355836009259951, "grad_norm": 0.6945080757141113, "kl": 0.2227783203125, "learning_rate": 1.0500443092255017e-05, "loss": 0.006, "reward": 2.119977742433548, "reward_std": 0.1800572145730257, "rewards/accuracy_reward": 0.1517857275903225, "rewards/format_reward": 0.975446492433548, "rewards/tag_count_reward": 0.9927455633878708, "step": 1793 }, { "clip_ratio": 0.0, "completion_length": 591.2321624755859, "epoch": 0.5358823090135165, "grad_norm": 1.8598713874816895, "kl": 0.46826171875, "learning_rate": 1.049002563655732e-05, "loss": 0.0453, "reward": 2.0708706080913544, "reward_std": 0.18600528687238693, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455633878708, "step": 1794 }, { "clip_ratio": 0.0, "completion_length": 599.4955596923828, "epoch": 0.536181017101038, "grad_norm": 2.6778252124786377, "kl": 1.11083984375, "learning_rate": 1.0479607647759755e-05, "loss": 0.0973, "reward": 2.1250001788139343, "reward_std": 0.2675684504210949, "rewards/accuracy_reward": 0.1718750037252903, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.98214291036129, "step": 1795 }, { "clip_ratio": 0.0, "completion_length": 545.5379638671875, "epoch": 0.5364797251885595, "grad_norm": 0.11019422858953476, "kl": 0.2314453125, "learning_rate": 1.0469189137196081e-05, "loss": 0.0006, "reward": 2.0775670409202576, "reward_std": 0.10521850734949112, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9972098469734192, "step": 1796 }, { "clip_ratio": 0.0, "completion_length": 554.4107360839844, "epoch": 0.536778433276081, "grad_norm": 5.5223774909973145, "kl": 1.890625, "learning_rate": 1.0458770116200605e-05, "loss": 0.1673, "reward": 2.0965403020381927, "reward_std": 0.21987433917820454, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.9827009290456772, "step": 1797 }, { "clip_ratio": 0.0, "completion_length": 598.1986999511719, "epoch": 0.5370771413636024, "grad_norm": 2.943739414215088, "kl": 1.402099609375, "learning_rate": 1.044835059610821e-05, "loss": 0.1366, "reward": 2.0580357909202576, "reward_std": 0.1603229008615017, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9910714626312256, "step": 1798 }, { "clip_ratio": 0.0, "completion_length": 603.1205749511719, "epoch": 0.5373758494511239, "grad_norm": 2.2326807975769043, "kl": 1.26171875, "learning_rate": 1.043793058825431e-05, "loss": 0.071, "reward": 2.051897406578064, "reward_std": 0.206081822514534, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652126312256, "step": 1799 }, { "clip_ratio": 0.0, "completion_length": 603.5893096923828, "epoch": 0.5376745575386453, "grad_norm": 9.343196868896484, "kl": 2.7928466796875, "learning_rate": 1.0427510103974853e-05, "loss": 0.1673, "reward": 2.0178571939468384, "reward_std": 0.19421611540019512, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9843750447034836, "step": 1800 }, { "clip_ratio": 0.0, "completion_length": 583.2299346923828, "epoch": 0.5379732656261669, "grad_norm": 0.6216228604316711, "kl": 0.3192138671875, "learning_rate": 1.0417089154606299e-05, "loss": 0.0123, "reward": 2.0753349661827087, "reward_std": 0.09151786100119352, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949777126312256, "step": 1801 }, { "clip_ratio": 0.0, "completion_length": 611.4888610839844, "epoch": 0.5382719737136883, "grad_norm": 1.520065426826477, "kl": 0.498046875, "learning_rate": 1.0406667751485628e-05, "loss": 0.0434, "reward": 2.079799234867096, "reward_std": 0.1956400666385889, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9882813096046448, "step": 1802 }, { "clip_ratio": 0.0, "completion_length": 581.194221496582, "epoch": 0.5385706818012098, "grad_norm": 3.085817813873291, "kl": 0.7769775390625, "learning_rate": 1.03962459059503e-05, "loss": 0.0705, "reward": 2.076451003551483, "reward_std": 0.13453439623117447, "rewards/accuracy_reward": 0.09151786169968545, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9960937947034836, "step": 1803 }, { "clip_ratio": 0.0, "completion_length": 583.2522583007812, "epoch": 0.5388693898887312, "grad_norm": 1.6944376230239868, "kl": 0.49951171875, "learning_rate": 1.0385823629338262e-05, "loss": 0.0692, "reward": 1.9977679550647736, "reward_std": 0.13839363399893045, "rewards/accuracy_reward": 0.022321428870782256, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.988839328289032, "step": 1804 }, { "clip_ratio": 0.0, "completion_length": 560.6741333007812, "epoch": 0.5391680979762528, "grad_norm": 2.7065553665161133, "kl": 0.42333984375, "learning_rate": 1.0375400932987932e-05, "loss": 0.0513, "reward": 1.997209906578064, "reward_std": 0.12104080058634281, "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.992745578289032, "step": 1805 }, { "clip_ratio": 0.0, "completion_length": 613.3750305175781, "epoch": 0.5394668060637742, "grad_norm": 1.657029151916504, "kl": 0.72802734375, "learning_rate": 1.0364977828238176e-05, "loss": 0.0302, "reward": 2.051897406578064, "reward_std": 0.19944793358445168, "rewards/accuracy_reward": 0.08705357275903225, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.984933078289032, "step": 1806 }, { "clip_ratio": 0.0, "completion_length": 550.1384124755859, "epoch": 0.5397655141512957, "grad_norm": 2.103269577026367, "kl": 0.6568603515625, "learning_rate": 1.0354554326428319e-05, "loss": 0.0327, "reward": 2.0608259737491608, "reward_std": 0.12854798883199692, "rewards/accuracy_reward": 0.07589286309666932, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 1807 }, { "clip_ratio": 0.0, "completion_length": 562.2500152587891, "epoch": 0.5400642222388171, "grad_norm": 2.365189552307129, "kl": 0.3905029296875, "learning_rate": 1.0344130438898101e-05, "loss": 0.0191, "reward": 2.0306921005249023, "reward_std": 0.12585796881467104, "rewards/accuracy_reward": 0.09375000721774995, "rewards/format_reward": 0.9642857313156128, "rewards/tag_count_reward": 0.9726562649011612, "step": 1808 }, { "clip_ratio": 0.0, "completion_length": 519.5960006713867, "epoch": 0.5403629303263385, "grad_norm": 1.0180060863494873, "kl": 0.5283203125, "learning_rate": 1.0333706176987697e-05, "loss": 0.0301, "reward": 2.076451003551483, "reward_std": 0.12838959693908691, "rewards/accuracy_reward": 0.09151786216534674, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9938616305589676, "step": 1809 }, { "clip_ratio": 0.0, "completion_length": 551.1272583007812, "epoch": 0.5406616384138601, "grad_norm": 5.512302875518799, "kl": 1.41162109375, "learning_rate": 1.0323281552037678e-05, "loss": 0.1782, "reward": 2.0362724363803864, "reward_std": 0.19551461935043335, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652126312256, "step": 1810 }, { "clip_ratio": 0.0, "completion_length": 576.5647430419922, "epoch": 0.5409603465013815, "grad_norm": 1.327096700668335, "kl": 0.5899658203125, "learning_rate": 1.0312856575389016e-05, "loss": 0.0452, "reward": 2.107142925262451, "reward_std": 0.10335747245699167, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 1811 }, { "clip_ratio": 0.0, "completion_length": 576.0245742797852, "epoch": 0.541259054588903, "grad_norm": 1.4565181732177734, "kl": 0.830810546875, "learning_rate": 1.0302431258383062e-05, "loss": 0.0379, "reward": 2.0742188692092896, "reward_std": 0.20431456342339516, "rewards/accuracy_reward": 0.10491072060540318, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9893973469734192, "step": 1812 }, { "clip_ratio": 0.0, "completion_length": 540.084846496582, "epoch": 0.5415577626764244, "grad_norm": 1.4780380725860596, "kl": 0.3990478515625, "learning_rate": 1.0292005612361542e-05, "loss": 0.0452, "reward": 2.0770090222358704, "reward_std": 0.057275486178696156, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9988839328289032, "step": 1813 }, { "clip_ratio": 0.0, "completion_length": 582.6071624755859, "epoch": 0.5418564707639459, "grad_norm": 2.3006985187530518, "kl": 0.3709716796875, "learning_rate": 1.0281579648666533e-05, "loss": 0.0576, "reward": 2.0429688692092896, "reward_std": 0.13616106938570738, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.991629496216774, "step": 1814 }, { "clip_ratio": 0.0, "completion_length": 565.9799194335938, "epoch": 0.5421551788514674, "grad_norm": 0.3726756274700165, "kl": 0.3619384765625, "learning_rate": 1.0271153378640464e-05, "loss": 0.0329, "reward": 2.1021206378936768, "reward_std": 0.15669592656195164, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 1815 }, { "clip_ratio": 0.0, "completion_length": 602.1473388671875, "epoch": 0.5424538869389889, "grad_norm": 0.7700011730194092, "kl": 0.582275390625, "learning_rate": 1.02607268136261e-05, "loss": 0.0426, "reward": 2.130022406578064, "reward_std": 0.17622553557157516, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 1816 }, { "clip_ratio": 0.0, "completion_length": 586.5468978881836, "epoch": 0.5427525950265103, "grad_norm": 5.2676777839660645, "kl": 1.3310546875, "learning_rate": 1.025029996496651e-05, "loss": 0.1017, "reward": 2.1261162161827087, "reward_std": 0.2162278788164258, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9877232611179352, "step": 1817 }, { "clip_ratio": 0.0, "completion_length": 569.6674346923828, "epoch": 0.5430513031140318, "grad_norm": 1.5120224952697754, "kl": 0.656982421875, "learning_rate": 1.0239872844005094e-05, "loss": 0.043, "reward": 2.1244420409202576, "reward_std": 0.12173982430249453, "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.992745578289032, "step": 1818 }, { "clip_ratio": 0.0, "completion_length": 569.9598541259766, "epoch": 0.5433500112015532, "grad_norm": 0.3728059232234955, "kl": 0.3284912109375, "learning_rate": 1.0229445462085531e-05, "loss": 0.028, "reward": 2.129464328289032, "reward_std": 0.14124061167240143, "rewards/accuracy_reward": 0.1406250111758709, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 1819 }, { "clip_ratio": 0.0, "completion_length": 621.0714569091797, "epoch": 0.5436487192890748, "grad_norm": 0.5609433054924011, "kl": 0.598388671875, "learning_rate": 1.0219017830551797e-05, "loss": 0.0235, "reward": 2.0998885333538055, "reward_std": 0.14658642932772636, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455633878708, "step": 1820 }, { "clip_ratio": 0.0, "completion_length": 593.6830749511719, "epoch": 0.5439474273765962, "grad_norm": 0.529446542263031, "kl": 0.2794189453125, "learning_rate": 1.0208589960748127e-05, "loss": 0.0277, "reward": 2.1819196939468384, "reward_std": 0.11747457087039948, "rewards/accuracy_reward": 0.19196429289877415, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966517984867096, "step": 1821 }, { "clip_ratio": 0.0, "completion_length": 613.6116333007812, "epoch": 0.5442461354641177, "grad_norm": 0.9026580452919006, "kl": 0.703369140625, "learning_rate": 1.0198161864019024e-05, "loss": 0.0591, "reward": 2.0507813096046448, "reward_std": 0.112008236348629, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9905134439468384, "step": 1822 }, { "clip_ratio": 0.0, "completion_length": 585.4330596923828, "epoch": 0.5445448435516391, "grad_norm": 1.4231427907943726, "kl": 0.88232421875, "learning_rate": 1.0187733551709236e-05, "loss": 0.0782, "reward": 2.0530134439468384, "reward_std": 0.2061488814651966, "rewards/accuracy_reward": 0.08928572130389512, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9860491454601288, "step": 1823 }, { "clip_ratio": 0.0, "completion_length": 560.325927734375, "epoch": 0.5448435516391607, "grad_norm": 1.6340529918670654, "kl": 0.6800537109375, "learning_rate": 1.0177305035163745e-05, "loss": 0.0299, "reward": 2.086495578289032, "reward_std": 0.16047331131994724, "rewards/accuracy_reward": 0.11160715040750802, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9905134290456772, "step": 1824 }, { "clip_ratio": 0.0, "completion_length": 580.6919708251953, "epoch": 0.5451422597266821, "grad_norm": 0.5744231343269348, "kl": 0.310302734375, "learning_rate": 1.016687632572775e-05, "loss": 0.0202, "reward": 2.197544753551483, "reward_std": 0.17712872475385666, "rewards/accuracy_reward": 0.21205357694998384, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9966517984867096, "step": 1825 }, { "clip_ratio": 0.0, "completion_length": 580.7388610839844, "epoch": 0.5454409678142036, "grad_norm": 10.463835716247559, "kl": 1.526611328125, "learning_rate": 1.0156447434746669e-05, "loss": 0.1037, "reward": 2.0736607909202576, "reward_std": 0.14454596117138863, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 1826 }, { "clip_ratio": 0.0, "completion_length": 608.3594055175781, "epoch": 0.545739675901725, "grad_norm": 1.0767422914505005, "kl": 0.3568115234375, "learning_rate": 1.0146018373566114e-05, "loss": 0.0428, "reward": 2.0864956378936768, "reward_std": 0.09715914912521839, "rewards/accuracy_reward": 0.1071428582072258, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9949776977300644, "step": 1827 }, { "clip_ratio": 0.0, "completion_length": 625.1160888671875, "epoch": 0.5460383839892465, "grad_norm": 3.5674431324005127, "kl": 1.771728515625, "learning_rate": 1.0135589153531879e-05, "loss": 0.0602, "reward": 2.0742188096046448, "reward_std": 0.126617643982172, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.975446492433548, "rewards/tag_count_reward": 0.9893973618745804, "step": 1828 }, { "clip_ratio": 0.0, "completion_length": 589.1875305175781, "epoch": 0.546337092076768, "grad_norm": 2.6124789714813232, "kl": 0.8792724609375, "learning_rate": 1.0125159785989933e-05, "loss": 0.0679, "reward": 2.007812589406967, "reward_std": 0.116306958720088, "rewards/accuracy_reward": 0.024553572293370962, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 1829 }, { "clip_ratio": 0.0, "completion_length": 630.6071624755859, "epoch": 0.5466358001642895, "grad_norm": 50.18336868286133, "kl": 17.96875, "learning_rate": 1.0114730282286408e-05, "loss": 0.8316, "reward": 1.6445313394069672, "reward_std": 0.5887133479118347, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.5915178880095482, "rewards/tag_count_reward": 0.9190848618745804, "step": 1830 }, { "clip_ratio": 0.0, "completion_length": 550.3460083007812, "epoch": 0.5469345082518109, "grad_norm": 24.472028732299805, "kl": 2.708984375, "learning_rate": 1.0104300653767582e-05, "loss": 0.101, "reward": 2.073102831840515, "reward_std": 0.2523689344525337, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.9620535969734192, "rewards/tag_count_reward": 0.9882812947034836, "step": 1831 }, { "clip_ratio": 0.0, "completion_length": 584.9866333007812, "epoch": 0.5472332163393324, "grad_norm": 5.10753059387207, "kl": 0.8529052734375, "learning_rate": 1.0093870911779866e-05, "loss": 0.0638, "reward": 2.1261161863803864, "reward_std": 0.1694105938076973, "rewards/accuracy_reward": 0.14508929569274187, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9921875298023224, "step": 1832 }, { "clip_ratio": 0.0, "completion_length": 553.0156402587891, "epoch": 0.5475319244268538, "grad_norm": 3.2284765243530273, "kl": 0.80517578125, "learning_rate": 1.0083441067669797e-05, "loss": 0.1054, "reward": 2.073102831840515, "reward_std": 0.22669288143515587, "rewards/accuracy_reward": 0.10491072223521769, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9860491454601288, "step": 1833 }, { "clip_ratio": 0.0, "completion_length": 570.5045013427734, "epoch": 0.5478306325143754, "grad_norm": 1.3830814361572266, "kl": 0.22021484375, "learning_rate": 1.0073011132784026e-05, "loss": 0.0167, "reward": 2.117745578289032, "reward_std": 0.14336936734616756, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.997209832072258, "step": 1834 }, { "clip_ratio": 0.0, "completion_length": 554.1741333007812, "epoch": 0.5481293406018968, "grad_norm": 1.2765169143676758, "kl": 0.8465576171875, "learning_rate": 1.00625811184693e-05, "loss": 0.065, "reward": 2.0636161267757416, "reward_std": 0.19598515704274178, "rewards/accuracy_reward": 0.08928571571595967, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553805589676, "step": 1835 }, { "clip_ratio": 0.0, "completion_length": 540.9732360839844, "epoch": 0.5484280486894183, "grad_norm": 1.356277585029602, "kl": 1.0146484375, "learning_rate": 1.0052151036072446e-05, "loss": 0.0719, "reward": 2.0736608505249023, "reward_std": 0.16817734949290752, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9866071790456772, "step": 1836 }, { "clip_ratio": 0.0, "completion_length": 564.1049270629883, "epoch": 0.5487267567769397, "grad_norm": 0.6888157725334167, "kl": 0.3505859375, "learning_rate": 1.004172089694038e-05, "loss": 0.0176, "reward": 2.091517925262451, "reward_std": 0.14737516455352306, "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.995535746216774, "step": 1837 }, { "clip_ratio": 0.0, "completion_length": 583.6049194335938, "epoch": 0.5490254648644612, "grad_norm": 1.6885027885437012, "kl": 0.3819580078125, "learning_rate": 1.0031290712420065e-05, "loss": 0.046, "reward": 2.0239956378936768, "reward_std": 0.1232454963028431, "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9949776977300644, "step": 1838 }, { "clip_ratio": 0.0, "completion_length": 602.9710083007812, "epoch": 0.5493241729519827, "grad_norm": 0.6890919804573059, "kl": 0.2200927734375, "learning_rate": 1.0020860493858524e-05, "loss": 0.0275, "reward": 2.1015626192092896, "reward_std": 0.20607321429997683, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9921875149011612, "step": 1839 }, { "clip_ratio": 0.0, "completion_length": 558.2299423217773, "epoch": 0.5496228810395042, "grad_norm": 1.5449650287628174, "kl": 0.7685546875, "learning_rate": 1.0010430252602808e-05, "loss": 0.0693, "reward": 2.083705484867096, "reward_std": 0.14410031028091908, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553954601288, "step": 1840 }, { "clip_ratio": 0.0, "completion_length": 577.1049346923828, "epoch": 0.5499215891270256, "grad_norm": 2.465759038925171, "kl": 0.4267578125, "learning_rate": 1e-05, "loss": 0.022, "reward": 2.0825893580913544, "reward_std": 0.07786033488810062, "rewards/accuracy_reward": 0.0915178582072258, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9977678656578064, "step": 1841 }, { "clip_ratio": 0.0, "completion_length": 541.8393020629883, "epoch": 0.5502202972145471, "grad_norm": 1.9603677988052368, "kl": 0.47998046875, "learning_rate": 9.989569747397194e-06, "loss": 0.0308, "reward": 2.1183037161827087, "reward_std": 0.1311756893992424, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 1842 }, { "clip_ratio": 0.0, "completion_length": 572.3504638671875, "epoch": 0.5505190053020685, "grad_norm": 1.8403549194335938, "kl": 0.7919921875, "learning_rate": 9.979139506141477e-06, "loss": 0.0604, "reward": 2.072544753551483, "reward_std": 0.18357745558023453, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.987723246216774, "step": 1843 }, { "clip_ratio": 0.0, "completion_length": 567.6919708251953, "epoch": 0.5508177133895901, "grad_norm": 0.5665842294692993, "kl": 0.36181640625, "learning_rate": 9.968709287579937e-06, "loss": 0.0439, "reward": 2.1155134737491608, "reward_std": 0.1756243221461773, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9927455633878708, "step": 1844 }, { "clip_ratio": 0.0, "completion_length": 576.6562805175781, "epoch": 0.5511164214771115, "grad_norm": 1.0660771131515503, "kl": 0.38623046875, "learning_rate": 9.958279103059624e-06, "loss": 0.0502, "reward": 2.0619420409202576, "reward_std": 0.15087686851620674, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.990513414144516, "step": 1845 }, { "clip_ratio": 0.0, "completion_length": 604.8593902587891, "epoch": 0.551415129564633, "grad_norm": 0.6077069640159607, "kl": 0.297119140625, "learning_rate": 9.947848963927556e-06, "loss": 0.017, "reward": 2.1255581378936768, "reward_std": 0.09925232827663422, "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616454601288, "step": 1846 }, { "clip_ratio": 0.0, "completion_length": 636.8772583007812, "epoch": 0.5517138376521544, "grad_norm": 0.5809691548347473, "kl": 0.401611328125, "learning_rate": 9.937418881530704e-06, "loss": 0.0222, "reward": 2.1473215222358704, "reward_std": 0.08174209762364626, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9977678805589676, "step": 1847 }, { "clip_ratio": 0.0, "completion_length": 592.794677734375, "epoch": 0.552012545739676, "grad_norm": 3.2689266204833984, "kl": 0.9300537109375, "learning_rate": 9.926988867215976e-06, "loss": 0.066, "reward": 2.0524554550647736, "reward_std": 0.11904737539589405, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9944196939468384, "step": 1848 }, { "clip_ratio": 0.0, "completion_length": 596.4040374755859, "epoch": 0.5523112538271974, "grad_norm": 0.1624487340450287, "kl": 0.2440185546875, "learning_rate": 9.916558932330206e-06, "loss": 0.0046, "reward": 2.0731027722358704, "reward_std": 0.11264518275856972, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949776977300644, "step": 1849 }, { "clip_ratio": 0.0, "completion_length": 609.3928985595703, "epoch": 0.5526099619147189, "grad_norm": 1.9973158836364746, "kl": 0.667724609375, "learning_rate": 9.906129088220137e-06, "loss": 0.0781, "reward": 2.0881697833538055, "reward_std": 0.19520063698291779, "rewards/accuracy_reward": 0.11830357671715319, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9877232313156128, "step": 1850 }, { "clip_ratio": 0.0, "completion_length": 596.7344055175781, "epoch": 0.5529086700022403, "grad_norm": 3.110403537750244, "kl": 0.5872802734375, "learning_rate": 9.895699346232422e-06, "loss": 0.0599, "reward": 2.1244421005249023, "reward_std": 0.1364116258919239, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 1851 }, { "clip_ratio": 0.0, "completion_length": 619.7924346923828, "epoch": 0.5532073780897617, "grad_norm": 2.2244491577148438, "kl": 1.1080322265625, "learning_rate": 9.885269717713595e-06, "loss": 0.0814, "reward": 2.06194207072258, "reward_std": 0.15200625732541084, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.983816996216774, "step": 1852 }, { "clip_ratio": 0.0, "completion_length": 619.0267944335938, "epoch": 0.5535060861772833, "grad_norm": 0.5603544116020203, "kl": 0.3272705078125, "learning_rate": 9.874840214010069e-06, "loss": 0.0154, "reward": 2.0982143878936768, "reward_std": 0.09179566986858845, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 1853 }, { "clip_ratio": 0.0, "completion_length": 602.5022583007812, "epoch": 0.5538047942648047, "grad_norm": 0.4135715663433075, "kl": 0.2520751953125, "learning_rate": 9.864410846468123e-06, "loss": 0.0286, "reward": 2.1997768878936768, "reward_std": 0.1310398131608963, "rewards/accuracy_reward": 0.2098214440047741, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.996651828289032, "step": 1854 }, { "clip_ratio": 0.0, "completion_length": 618.9776916503906, "epoch": 0.5541035023523262, "grad_norm": 5.5384321212768555, "kl": 0.57421875, "learning_rate": 9.85398162643389e-06, "loss": 0.086, "reward": 2.0970982909202576, "reward_std": 0.22153820283710957, "rewards/accuracy_reward": 0.13169643748551607, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9854911118745804, "step": 1855 }, { "clip_ratio": 0.0, "completion_length": 628.6272583007812, "epoch": 0.5544022104398476, "grad_norm": 1.0558170080184937, "kl": 0.5625, "learning_rate": 9.843552565253333e-06, "loss": 0.0457, "reward": 2.016741156578064, "reward_std": 0.15395007468760014, "rewards/accuracy_reward": 0.03348214388824999, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 1856 }, { "clip_ratio": 0.0, "completion_length": 583.2611846923828, "epoch": 0.5547009185273691, "grad_norm": 0.8800897002220154, "kl": 0.1768798828125, "learning_rate": 9.833123674272252e-06, "loss": 0.0145, "reward": 2.1333706378936768, "reward_std": 0.18510041385889053, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.997209832072258, "step": 1857 }, { "clip_ratio": 0.0, "completion_length": 585.2098541259766, "epoch": 0.5549996266148906, "grad_norm": 22.56063461303711, "kl": 2.768310546875, "learning_rate": 9.822694964836259e-06, "loss": 0.2164, "reward": 2.133928656578064, "reward_std": 0.2035820297896862, "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9866071790456772, "step": 1858 }, { "clip_ratio": 0.0, "completion_length": 636.1919860839844, "epoch": 0.5552983347024121, "grad_norm": 1.6960123777389526, "kl": 0.7900390625, "learning_rate": 9.812266448290767e-06, "loss": 0.0637, "reward": 2.0546876192092896, "reward_std": 0.19502363726496696, "rewards/accuracy_reward": 0.08035714575089514, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9899553954601288, "step": 1859 }, { "clip_ratio": 0.0, "completion_length": 589.5491333007812, "epoch": 0.5555970427899335, "grad_norm": 17.774261474609375, "kl": 1.9140625, "learning_rate": 9.80183813598098e-06, "loss": 0.2338, "reward": 2.0898438096046448, "reward_std": 0.18759983032941818, "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.991629496216774, "step": 1860 }, { "clip_ratio": 0.0, "completion_length": 609.9419860839844, "epoch": 0.555895750877455, "grad_norm": 7.153156757354736, "kl": 1.183349609375, "learning_rate": 9.791410039251874e-06, "loss": 0.0912, "reward": 2.126116156578064, "reward_std": 0.15163368918001652, "rewards/accuracy_reward": 0.14062500465661287, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.994419664144516, "step": 1861 }, { "clip_ratio": 0.0, "completion_length": 591.1473541259766, "epoch": 0.5561944589649764, "grad_norm": 2.2354214191436768, "kl": 0.834716796875, "learning_rate": 9.780982169448205e-06, "loss": 0.0622, "reward": 2.1088170409202576, "reward_std": 0.2059597410261631, "rewards/accuracy_reward": 0.13839286472648382, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9882812947034836, "step": 1862 }, { "clip_ratio": 0.0, "completion_length": 611.9777069091797, "epoch": 0.556493167052498, "grad_norm": 11.31606674194336, "kl": 0.708251953125, "learning_rate": 9.77055453791447e-06, "loss": 0.0521, "reward": 2.029017984867096, "reward_std": 0.10780016798526049, "rewards/accuracy_reward": 0.04017857392318547, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 1863 }, { "clip_ratio": 0.0, "completion_length": 651.2879791259766, "epoch": 0.5567918751400194, "grad_norm": 0.7764716148376465, "kl": 0.2100830078125, "learning_rate": 9.760127155994907e-06, "loss": 0.0257, "reward": 2.0853795409202576, "reward_std": 0.13368579372763634, "rewards/accuracy_reward": 0.10491071850992739, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9938616454601288, "step": 1864 }, { "clip_ratio": 0.0, "completion_length": 620.6116485595703, "epoch": 0.5570905832275409, "grad_norm": 2.167095422744751, "kl": 0.2288818359375, "learning_rate": 9.749700035033492e-06, "loss": 0.0385, "reward": 2.090959906578064, "reward_std": 0.10730606783181429, "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9927455633878708, "step": 1865 }, { "clip_ratio": 0.0, "completion_length": 589.0781555175781, "epoch": 0.5573892913150623, "grad_norm": 0.47958672046661377, "kl": 0.2269287109375, "learning_rate": 9.739273186373906e-06, "loss": 0.0065, "reward": 2.078125089406967, "reward_std": 0.08306325972080231, "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678805589676, "step": 1866 }, { "clip_ratio": 0.0, "completion_length": 582.3437652587891, "epoch": 0.5576879994025838, "grad_norm": 1.421558141708374, "kl": 0.271484375, "learning_rate": 9.728846621359538e-06, "loss": 0.0386, "reward": 2.1422992050647736, "reward_std": 0.09780097752809525, "rewards/accuracy_reward": 0.15178572316654027, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9972098618745804, "step": 1867 }, { "clip_ratio": 0.0, "completion_length": 590.6227874755859, "epoch": 0.5579867074901053, "grad_norm": 0.21398933231830597, "kl": 0.1343994140625, "learning_rate": 9.718420351333469e-06, "loss": 0.0134, "reward": 2.0931920409202576, "reward_std": 0.10069733951240778, "rewards/accuracy_reward": 0.09598214528523386, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9994419813156128, "step": 1868 }, { "clip_ratio": 0.0, "completion_length": 578.2723388671875, "epoch": 0.5582854155776268, "grad_norm": 1.707909107208252, "kl": 0.158935546875, "learning_rate": 9.707994387638461e-06, "loss": 0.0278, "reward": 2.0742188096046448, "reward_std": 0.12994040362536907, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9938616454601288, "step": 1869 }, { "clip_ratio": 0.0, "completion_length": 574.8884124755859, "epoch": 0.5585841236651482, "grad_norm": 0.9569913148880005, "kl": 0.308837890625, "learning_rate": 9.697568741616942e-06, "loss": 0.0293, "reward": 2.0530134737491608, "reward_std": 0.13752770610153675, "rewards/accuracy_reward": 0.0803571492433548, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9905134290456772, "step": 1870 }, { "clip_ratio": 0.0, "completion_length": 591.7991409301758, "epoch": 0.5588828317526697, "grad_norm": 4.531326770782471, "kl": 0.9722900390625, "learning_rate": 9.687143424610986e-06, "loss": 0.0698, "reward": 2.068638503551483, "reward_std": 0.18394095543771982, "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9927455633878708, "step": 1871 }, { "clip_ratio": 0.0, "completion_length": 569.3303909301758, "epoch": 0.5591815398401911, "grad_norm": 0.38664597272872925, "kl": 0.2237548828125, "learning_rate": 9.676718447962325e-06, "loss": 0.0259, "reward": 2.053571581840515, "reward_std": 0.10946273524314165, "rewards/accuracy_reward": 0.06473214784637094, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 1872 }, { "clip_ratio": 0.0, "completion_length": 545.810302734375, "epoch": 0.5594802479277127, "grad_norm": 2.4316136837005615, "kl": 0.4161376953125, "learning_rate": 9.666293823012306e-06, "loss": 0.0866, "reward": 2.1093750298023224, "reward_std": 0.17119484208524227, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9933036118745804, "step": 1873 }, { "clip_ratio": 0.0, "completion_length": 590.310302734375, "epoch": 0.5597789560152341, "grad_norm": 0.26384320855140686, "kl": 0.230712890625, "learning_rate": 9.6558695611019e-06, "loss": 0.027, "reward": 2.1199777722358704, "reward_std": 0.0970877492800355, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 1874 }, { "clip_ratio": 0.0, "completion_length": 606.513427734375, "epoch": 0.5600776641027556, "grad_norm": 2.39729905128479, "kl": 0.9114990234375, "learning_rate": 9.645445673571685e-06, "loss": 0.0521, "reward": 2.036830425262451, "reward_std": 0.15896708145737648, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9921875596046448, "step": 1875 }, { "clip_ratio": 0.0, "completion_length": 619.6094055175781, "epoch": 0.560376372190277, "grad_norm": 1.2393438816070557, "kl": 0.3531494140625, "learning_rate": 9.635022171761826e-06, "loss": 0.0307, "reward": 2.1283482909202576, "reward_std": 0.1825787853449583, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9921875298023224, "step": 1876 }, { "clip_ratio": 0.0, "completion_length": 610.435302734375, "epoch": 0.5606750802777986, "grad_norm": 8.037117958068848, "kl": 0.9720458984375, "learning_rate": 9.624599067012073e-06, "loss": 0.1216, "reward": 2.1718751192092896, "reward_std": 0.15858605410903692, "rewards/accuracy_reward": 0.19196430081501603, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714626312256, "step": 1877 }, { "clip_ratio": 0.0, "completion_length": 630.6294860839844, "epoch": 0.56097378836532, "grad_norm": 0.6034764647483826, "kl": 0.4686279296875, "learning_rate": 9.61417637066174e-06, "loss": 0.0397, "reward": 2.098772406578064, "reward_std": 0.12367314100265503, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9938616454601288, "step": 1878 }, { "clip_ratio": 0.0, "completion_length": 640.0424346923828, "epoch": 0.5612724964528415, "grad_norm": 0.9868949055671692, "kl": 0.28369140625, "learning_rate": 9.603754094049702e-06, "loss": 0.047, "reward": 2.039062589406967, "reward_std": 0.15875785797834396, "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9921875298023224, "step": 1879 }, { "clip_ratio": 0.0, "completion_length": 599.5580444335938, "epoch": 0.5615712045403629, "grad_norm": 1.3674198389053345, "kl": 0.2227783203125, "learning_rate": 9.593332248514374e-06, "loss": 0.0227, "reward": 2.1328126788139343, "reward_std": 0.1884518824517727, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9944196790456772, "step": 1880 }, { "clip_ratio": 0.0, "completion_length": 588.6004791259766, "epoch": 0.5618699126278844, "grad_norm": 4.417736053466797, "kl": 0.646484375, "learning_rate": 9.582910845393703e-06, "loss": 0.0924, "reward": 2.0970982909202576, "reward_std": 0.12022263091057539, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.994419664144516, "step": 1881 }, { "clip_ratio": 0.0, "completion_length": 639.1071472167969, "epoch": 0.5621686207154059, "grad_norm": 0.36813807487487793, "kl": 0.193115234375, "learning_rate": 9.57248989602515e-06, "loss": 0.019, "reward": 2.0976563692092896, "reward_std": 0.1586737846955657, "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 1882 }, { "clip_ratio": 0.0, "completion_length": 634.7120819091797, "epoch": 0.5624673288029274, "grad_norm": 1.1762593984603882, "kl": 0.35107421875, "learning_rate": 9.562069411745692e-06, "loss": 0.037, "reward": 2.0965402722358704, "reward_std": 0.17615276016294956, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652126312256, "step": 1883 }, { "clip_ratio": 0.0, "completion_length": 621.1183319091797, "epoch": 0.5627660368904488, "grad_norm": 1.5854125022888184, "kl": 0.617431640625, "learning_rate": 9.551649403891792e-06, "loss": 0.0651, "reward": 2.107142984867096, "reward_std": 0.16927179135382175, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9955357611179352, "step": 1884 }, { "clip_ratio": 0.0, "completion_length": 621.825927734375, "epoch": 0.5630647449779703, "grad_norm": 2.006981611251831, "kl": 0.9771728515625, "learning_rate": 9.541229883799397e-06, "loss": 0.1133, "reward": 2.058593839406967, "reward_std": 0.2036888264119625, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9871651977300644, "step": 1885 }, { "clip_ratio": 0.0, "completion_length": 601.7902069091797, "epoch": 0.5633634530654917, "grad_norm": 2.033630609512329, "kl": 0.91650390625, "learning_rate": 9.530810862803922e-06, "loss": 0.1081, "reward": 2.109933078289032, "reward_std": 0.18459918349981308, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9893973767757416, "step": 1886 }, { "clip_ratio": 0.0, "completion_length": 594.0826263427734, "epoch": 0.5636621611530133, "grad_norm": 1.3048715591430664, "kl": 0.34423828125, "learning_rate": 9.520392352240246e-06, "loss": 0.0559, "reward": 2.0831473767757416, "reward_std": 0.15798486955463886, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9893973618745804, "step": 1887 }, { "clip_ratio": 0.0, "completion_length": 584.6205596923828, "epoch": 0.5639608692405347, "grad_norm": 0.32023340463638306, "kl": 0.332275390625, "learning_rate": 9.509974363442684e-06, "loss": 0.0216, "reward": 2.0463171005249023, "reward_std": 0.12076911143958569, "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.997209832072258, "step": 1888 }, { "clip_ratio": 0.0, "completion_length": 584.4531555175781, "epoch": 0.5642595773280562, "grad_norm": 1.5071274042129517, "kl": 0.512451171875, "learning_rate": 9.499556907744985e-06, "loss": 0.0442, "reward": 2.075334906578064, "reward_std": 0.1140301339328289, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9882812798023224, "step": 1889 }, { "clip_ratio": 0.0, "completion_length": 556.6361999511719, "epoch": 0.5645582854155776, "grad_norm": 0.9336078763008118, "kl": 0.3143310546875, "learning_rate": 9.489139996480324e-06, "loss": 0.0287, "reward": 2.2137277722358704, "reward_std": 0.11295809410512447, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 1890 }, { "clip_ratio": 0.0, "completion_length": 599.8750305175781, "epoch": 0.5648569935030991, "grad_norm": 0.5353695154190063, "kl": 0.4818115234375, "learning_rate": 9.478723640981276e-06, "loss": 0.0409, "reward": 2.0558037161827087, "reward_std": 0.12008961569517851, "rewards/accuracy_reward": 0.06919643259607255, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1891 }, { "clip_ratio": 0.0, "completion_length": 598.4910888671875, "epoch": 0.5651557015906206, "grad_norm": 6.217193126678467, "kl": 1.1683349609375, "learning_rate": 9.468307852579815e-06, "loss": 0.1186, "reward": 2.182477742433548, "reward_std": 0.15310476161539555, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9905134439468384, "step": 1892 }, { "clip_ratio": 0.0, "completion_length": 562.9085083007812, "epoch": 0.5654544096781421, "grad_norm": 3.811983346939087, "kl": 1.2049560546875, "learning_rate": 9.4578926426073e-06, "loss": 0.1395, "reward": 2.1216518878936768, "reward_std": 0.21427952870726585, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.987723246216774, "step": 1893 }, { "clip_ratio": 0.0, "completion_length": 578.4464416503906, "epoch": 0.5657531177656635, "grad_norm": 2.621065139770508, "kl": 0.515625, "learning_rate": 9.447478022394457e-06, "loss": 0.0462, "reward": 2.079241156578064, "reward_std": 0.15763495489954948, "rewards/accuracy_reward": 0.10714285937137902, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9877232611179352, "step": 1894 }, { "clip_ratio": 0.0, "completion_length": 582.1518020629883, "epoch": 0.5660518258531849, "grad_norm": 0.864618182182312, "kl": 0.594482421875, "learning_rate": 9.437064003271373e-06, "loss": 0.0681, "reward": 2.078683167695999, "reward_std": 0.1487265545874834, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.991629496216774, "step": 1895 }, { "clip_ratio": 0.0, "completion_length": 639.1428833007812, "epoch": 0.5663505339407064, "grad_norm": 1.5698312520980835, "kl": 0.6741943359375, "learning_rate": 9.426650596567479e-06, "loss": 0.0839, "reward": 2.090401917695999, "reward_std": 0.1616341769695282, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.9620536267757416, "rewards/tag_count_reward": 0.9743303954601288, "step": 1896 }, { "clip_ratio": 0.0, "completion_length": 607.9330520629883, "epoch": 0.5666492420282279, "grad_norm": 0.8945545554161072, "kl": 0.5419921875, "learning_rate": 9.416237813611542e-06, "loss": 0.0747, "reward": 2.021205484867096, "reward_std": 0.1752141797915101, "rewards/accuracy_reward": 0.053571432596072555, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.987723246216774, "step": 1897 }, { "clip_ratio": 0.0, "completion_length": 582.3102874755859, "epoch": 0.5669479501157494, "grad_norm": 0.8246076703071594, "kl": 0.665771484375, "learning_rate": 9.405825665731651e-06, "loss": 0.0724, "reward": 2.1138394474983215, "reward_std": 0.22357479482889175, "rewards/accuracy_reward": 0.1495535832364112, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9866071790456772, "step": 1898 }, { "clip_ratio": 0.0, "completion_length": 598.4397583007812, "epoch": 0.5672466582032708, "grad_norm": 0.7973484992980957, "kl": 0.4503173828125, "learning_rate": 9.3954141642552e-06, "loss": 0.0504, "reward": 2.0412946939468384, "reward_std": 0.19331995025277138, "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875447034836, "step": 1899 }, { "clip_ratio": 0.0, "completion_length": 611.6919860839844, "epoch": 0.5675453662907923, "grad_norm": 1.9625385999679565, "kl": 0.595703125, "learning_rate": 9.38500332050888e-06, "loss": 0.0859, "reward": 2.0234376192092896, "reward_std": 0.21325533464550972, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.987723246216774, "step": 1900 }, { "clip_ratio": 0.0, "completion_length": 619.3147583007812, "epoch": 0.5678440743783137, "grad_norm": 29.741180419921875, "kl": 2.829345703125, "learning_rate": 9.374593145818673e-06, "loss": 0.2308, "reward": 2.0184152722358704, "reward_std": 0.23897942900657654, "rewards/accuracy_reward": 0.07812500512227416, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9804687947034836, "step": 1901 }, { "clip_ratio": 0.0, "completion_length": 562.9776992797852, "epoch": 0.5681427824658353, "grad_norm": 0.7213208675384521, "kl": 0.41748046875, "learning_rate": 9.364183651509826e-06, "loss": 0.0322, "reward": 2.067522406578064, "reward_std": 0.18443870916962624, "rewards/accuracy_reward": 0.09151786100119352, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9938616454601288, "step": 1902 }, { "clip_ratio": 0.0, "completion_length": 628.1741333007812, "epoch": 0.5684414905533567, "grad_norm": 3.558987855911255, "kl": 1.03759765625, "learning_rate": 9.353774848906849e-06, "loss": 0.1196, "reward": 2.0027903020381927, "reward_std": 0.3021239712834358, "rewards/accuracy_reward": 0.08258929196745157, "rewards/format_reward": 0.948660746216774, "rewards/tag_count_reward": 0.9715402275323868, "step": 1903 }, { "clip_ratio": 0.0, "completion_length": 616.410758972168, "epoch": 0.5687401986408782, "grad_norm": 0.7177006602287292, "kl": 0.76953125, "learning_rate": 9.343366749333502e-06, "loss": 0.0371, "reward": 2.0691965222358704, "reward_std": 0.190860734321177, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9843750149011612, "step": 1904 }, { "clip_ratio": 0.0, "completion_length": 575.5134124755859, "epoch": 0.5690389067283996, "grad_norm": 0.3719489872455597, "kl": 0.451416015625, "learning_rate": 9.332959364112772e-06, "loss": 0.0101, "reward": 2.070870667695999, "reward_std": 0.24134521558880806, "rewards/accuracy_reward": 0.1138392947614193, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9838170111179352, "step": 1905 }, { "clip_ratio": 0.0, "completion_length": 604.8236846923828, "epoch": 0.5693376148159212, "grad_norm": 0.45470577478408813, "kl": 0.1778564453125, "learning_rate": 9.32255270456688e-06, "loss": 0.0374, "reward": 2.0669643878936768, "reward_std": 0.15849695913493633, "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933035969734192, "step": 1906 }, { "clip_ratio": 0.0, "completion_length": 564.6607360839844, "epoch": 0.5696363229034426, "grad_norm": 1.1607919931411743, "kl": 0.298583984375, "learning_rate": 9.312146782017244e-06, "loss": 0.0588, "reward": 2.0708706080913544, "reward_std": 0.15853802859783173, "rewards/accuracy_reward": 0.0982142873108387, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812798023224, "step": 1907 }, { "clip_ratio": 0.0, "completion_length": 606.7455596923828, "epoch": 0.5699350309909641, "grad_norm": 0.3382757902145386, "kl": 0.2119140625, "learning_rate": 9.301741607784495e-06, "loss": 0.0349, "reward": 2.1316965520381927, "reward_std": 0.1753881871700287, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 1908 }, { "clip_ratio": 0.0, "completion_length": 580.7366180419922, "epoch": 0.5702337390784855, "grad_norm": 0.8313273191452026, "kl": 0.537353515625, "learning_rate": 9.29133719318844e-06, "loss": 0.0546, "reward": 2.0301340222358704, "reward_std": 0.1947825737297535, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553805589676, "step": 1909 }, { "clip_ratio": 0.0, "completion_length": 524.1986923217773, "epoch": 0.570532447166007, "grad_norm": 2.2977840900421143, "kl": 0.6959228515625, "learning_rate": 9.28093354954806e-06, "loss": 0.0771, "reward": 2.1534599661827087, "reward_std": 0.20723708346486092, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.990513414144516, "step": 1910 }, { "clip_ratio": 0.0, "completion_length": 560.7120666503906, "epoch": 0.5708311552535285, "grad_norm": 1.215287685394287, "kl": 0.510986328125, "learning_rate": 9.270530688181506e-06, "loss": -0.005, "reward": 2.1099331378936768, "reward_std": 0.10535169765353203, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.991629496216774, "step": 1911 }, { "clip_ratio": 0.0, "completion_length": 579.0089569091797, "epoch": 0.57112986334105, "grad_norm": 0.33746254444122314, "kl": 0.160888671875, "learning_rate": 9.260128620406066e-06, "loss": 0.0132, "reward": 2.1004465222358704, "reward_std": 0.10921970382332802, "rewards/accuracy_reward": 0.10267857927829027, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 1.0, "step": 1912 }, { "clip_ratio": 0.0, "completion_length": 547.2768096923828, "epoch": 0.5714285714285714, "grad_norm": 0.16007356345653534, "kl": 0.172607421875, "learning_rate": 9.249727357538171e-06, "loss": 0.0054, "reward": 2.076451003551483, "reward_std": 0.15074845403432846, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 1913 }, { "clip_ratio": 0.0, "completion_length": 541.3504638671875, "epoch": 0.5717272795160929, "grad_norm": 0.5807108283042908, "kl": 0.281982421875, "learning_rate": 9.239326910893378e-06, "loss": 0.0385, "reward": 2.1484376192092896, "reward_std": 0.18684501200914383, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 1914 }, { "clip_ratio": 0.0, "completion_length": 575.5223388671875, "epoch": 0.5720259876036143, "grad_norm": 0.5603349804878235, "kl": 0.3387451171875, "learning_rate": 9.22892729178635e-06, "loss": 0.035, "reward": 2.1406250596046448, "reward_std": 0.09248005133122206, "rewards/accuracy_reward": 0.15625000605359674, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 1915 }, { "clip_ratio": 0.0, "completion_length": 560.138427734375, "epoch": 0.5723246956911359, "grad_norm": 1.7022987604141235, "kl": 0.51708984375, "learning_rate": 9.218528511530857e-06, "loss": 0.0215, "reward": 2.157924234867096, "reward_std": 0.1316396649926901, "rewards/accuracy_reward": 0.1696428619325161, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949777126312256, "step": 1916 }, { "clip_ratio": 0.0, "completion_length": 558.7611846923828, "epoch": 0.5726234037786573, "grad_norm": 0.11567039042711258, "kl": 0.3909912109375, "learning_rate": 9.208130581439749e-06, "loss": 0.004, "reward": 2.173549175262451, "reward_std": 0.10339736379683018, "rewards/accuracy_reward": 0.18080358137376606, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.997209832072258, "step": 1917 }, { "clip_ratio": 0.0, "completion_length": 561.8861846923828, "epoch": 0.5729221118661788, "grad_norm": 2.717337131500244, "kl": 0.6358642578125, "learning_rate": 9.197733512824958e-06, "loss": 0.0339, "reward": 2.065848261117935, "reward_std": 0.0764784961938858, "rewards/accuracy_reward": 0.0736607201397419, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 1918 }, { "clip_ratio": 0.0, "completion_length": 566.9620819091797, "epoch": 0.5732208199537002, "grad_norm": 1.4179893732070923, "kl": 0.5657958984375, "learning_rate": 9.187337316997475e-06, "loss": 0.0335, "reward": 2.1060268878936768, "reward_std": 0.16927063837647438, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 1919 }, { "clip_ratio": 0.0, "completion_length": 571.2343902587891, "epoch": 0.5735195280412217, "grad_norm": 2.979187488555908, "kl": 0.7742919921875, "learning_rate": 9.176942005267342e-06, "loss": 0.0652, "reward": 2.080357253551483, "reward_std": 0.09959924221038818, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 1920 }, { "clip_ratio": 0.0, "completion_length": 561.0067443847656, "epoch": 0.5738182361287432, "grad_norm": 0.7108328938484192, "kl": 0.959228515625, "learning_rate": 9.166547588943636e-06, "loss": 0.0653, "reward": 2.048549234867096, "reward_std": 0.15163362212479115, "rewards/accuracy_reward": 0.07812500186264515, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9860491305589676, "step": 1921 }, { "clip_ratio": 0.0, "completion_length": 571.919677734375, "epoch": 0.5741169442162647, "grad_norm": 1.191407561302185, "kl": 0.2337646484375, "learning_rate": 9.15615407933447e-06, "loss": 0.0242, "reward": 2.106026828289032, "reward_std": 0.08601411432027817, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 1922 }, { "clip_ratio": 0.0, "completion_length": 609.1160888671875, "epoch": 0.5744156523037861, "grad_norm": 2.8107545375823975, "kl": 0.3560791015625, "learning_rate": 9.145761487746958e-06, "loss": 0.0347, "reward": 2.200334906578064, "reward_std": 0.14291714504361153, "rewards/accuracy_reward": 0.22098215110599995, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 1923 }, { "clip_ratio": 0.0, "completion_length": 584.6495819091797, "epoch": 0.5747143603913076, "grad_norm": 5.059950828552246, "kl": 0.697265625, "learning_rate": 9.135369825487222e-06, "loss": 0.1013, "reward": 2.130580484867096, "reward_std": 0.15064892917871475, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9899553954601288, "step": 1924 }, { "clip_ratio": 0.0, "completion_length": 572.0625305175781, "epoch": 0.575013068478829, "grad_norm": 3.496459484100342, "kl": 0.4110107421875, "learning_rate": 9.124979103860374e-06, "loss": 0.0849, "reward": 2.1361608505249023, "reward_std": 0.13466838374733925, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.993303582072258, "step": 1925 }, { "clip_ratio": 0.0, "completion_length": 535.1250305175781, "epoch": 0.5753117765663506, "grad_norm": 1.3430384397506714, "kl": 0.4766845703125, "learning_rate": 9.1145893341705e-06, "loss": 0.042, "reward": 2.115513503551483, "reward_std": 0.14526437316089869, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 1926 }, { "clip_ratio": 0.0, "completion_length": 568.0446624755859, "epoch": 0.575610484653872, "grad_norm": 0.5559157133102417, "kl": 0.297119140625, "learning_rate": 9.104200527720652e-06, "loss": 0.0235, "reward": 2.0775671005249023, "reward_std": 0.15904142335057259, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9972098469734192, "step": 1927 }, { "clip_ratio": 0.0, "completion_length": 586.7656555175781, "epoch": 0.5759091927413935, "grad_norm": 0.6458982229232788, "kl": 0.3038330078125, "learning_rate": 9.093812695812828e-06, "loss": 0.0384, "reward": 2.029576003551483, "reward_std": 0.11195406597107649, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937798023224, "step": 1928 }, { "clip_ratio": 0.0, "completion_length": 580.9799346923828, "epoch": 0.5762079008289149, "grad_norm": 1.8496980667114258, "kl": 1.025390625, "learning_rate": 9.08342584974798e-06, "loss": 0.0396, "reward": 2.0892858505249023, "reward_std": 0.17537972703576088, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.988839328289032, "step": 1929 }, { "clip_ratio": 0.0, "completion_length": 592.5670013427734, "epoch": 0.5765066089164365, "grad_norm": 4.808115482330322, "kl": 1.09716796875, "learning_rate": 9.07304000082597e-06, "loss": 0.0508, "reward": 2.0239956974983215, "reward_std": 0.1679250244051218, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134439468384, "step": 1930 }, { "clip_ratio": 0.0, "completion_length": 579.1607360839844, "epoch": 0.5768053170039579, "grad_norm": 0.3993438482284546, "kl": 0.57470703125, "learning_rate": 9.062655160345587e-06, "loss": 0.0554, "reward": 1.99944207072258, "reward_std": 0.1392408274114132, "rewards/accuracy_reward": 0.020089286845177412, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.990513414144516, "step": 1931 }, { "clip_ratio": 0.0, "completion_length": 597.8259124755859, "epoch": 0.5771040250914794, "grad_norm": 0.1701614260673523, "kl": 0.119384765625, "learning_rate": 9.052271339604523e-06, "loss": 0.0124, "reward": 2.1902902722358704, "reward_std": 0.1049960758537054, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.998325914144516, "step": 1932 }, { "clip_ratio": 0.0, "completion_length": 506.1607437133789, "epoch": 0.5774027331790008, "grad_norm": 0.8629541993141174, "kl": 0.191650390625, "learning_rate": 9.041888549899352e-06, "loss": 0.0156, "reward": 2.1517858505249023, "reward_std": 0.11443711258471012, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 1933 }, { "clip_ratio": 0.0, "completion_length": 539.1183471679688, "epoch": 0.5777014412665223, "grad_norm": 0.14390528202056885, "kl": 0.0867919921875, "learning_rate": 9.031506802525535e-06, "loss": 0.013, "reward": 2.1093750596046448, "reward_std": 0.13580003380775452, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1934 }, { "clip_ratio": 0.0, "completion_length": 556.9174346923828, "epoch": 0.5780001493540438, "grad_norm": 2.471289873123169, "kl": 0.651123046875, "learning_rate": 9.021126108777391e-06, "loss": 0.0498, "reward": 2.0597098767757416, "reward_std": 0.15005733631551266, "rewards/accuracy_reward": 0.0714285762514919, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 1935 }, { "clip_ratio": 0.0, "completion_length": 547.3995971679688, "epoch": 0.5782988574415653, "grad_norm": 2.413480520248413, "kl": 0.44677734375, "learning_rate": 9.010746479948105e-06, "loss": 0.026, "reward": 2.122767925262451, "reward_std": 0.10160730499774218, "rewards/accuracy_reward": 0.12723215040750802, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 1936 }, { "clip_ratio": 0.0, "completion_length": 597.7611846923828, "epoch": 0.5785975655290867, "grad_norm": 0.08477965742349625, "kl": 0.0908203125, "learning_rate": 9.000367927329691e-06, "loss": 0.0064, "reward": 2.0145090520381927, "reward_std": 0.0491071455180645, "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9988839328289032, "step": 1937 }, { "clip_ratio": 0.0, "completion_length": 563.3169860839844, "epoch": 0.5788962736166081, "grad_norm": 1.2347376346588135, "kl": 0.518798828125, "learning_rate": 8.989990462212994e-06, "loss": 0.035, "reward": 2.119419753551483, "reward_std": 0.15150070190429688, "rewards/accuracy_reward": 0.13616072130389512, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 1938 }, { "clip_ratio": 0.0, "completion_length": 536.6027069091797, "epoch": 0.5791949817041296, "grad_norm": 1.0875264406204224, "kl": 0.25537109375, "learning_rate": 8.979614095887685e-06, "loss": 0.0402, "reward": 2.1222099661827087, "reward_std": 0.17897918168455362, "rewards/accuracy_reward": 0.13839286123402417, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9927455484867096, "step": 1939 }, { "clip_ratio": 0.0, "completion_length": 558.732177734375, "epoch": 0.5794936897916511, "grad_norm": 0.8649752736091614, "kl": 0.5185546875, "learning_rate": 8.969238839642232e-06, "loss": 0.0295, "reward": 2.0625001192092896, "reward_std": 0.12733608298003674, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 1940 }, { "clip_ratio": 0.0, "completion_length": 582.1584930419922, "epoch": 0.5797923978791726, "grad_norm": 0.9769120812416077, "kl": 0.6119384765625, "learning_rate": 8.958864704763896e-06, "loss": 0.0533, "reward": 2.076451003551483, "reward_std": 0.25042734295129776, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.984933078289032, "step": 1941 }, { "clip_ratio": 0.0, "completion_length": 579.444221496582, "epoch": 0.580091105966694, "grad_norm": 1.9443495273590088, "kl": 0.9752197265625, "learning_rate": 8.948491702538716e-06, "loss": 0.0535, "reward": 2.1880581378936768, "reward_std": 0.1801163498312235, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973618745804, "step": 1942 }, { "clip_ratio": 0.0, "completion_length": 556.1406402587891, "epoch": 0.5803898140542155, "grad_norm": 0.2056066393852234, "kl": 0.22314453125, "learning_rate": 8.938119844251507e-06, "loss": 0.0228, "reward": 2.126674175262451, "reward_std": 0.10572467464953661, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 1943 }, { "clip_ratio": 0.0, "completion_length": 622.2678985595703, "epoch": 0.5806885221417369, "grad_norm": 3.030778408050537, "kl": 1.080810546875, "learning_rate": 8.927749141185833e-06, "loss": 0.0734, "reward": 2.0507813096046448, "reward_std": 0.19695055950433016, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9882812798023224, "step": 1944 }, { "clip_ratio": 0.0, "completion_length": 636.5290374755859, "epoch": 0.5809872302292585, "grad_norm": 1.4798063039779663, "kl": 0.3255615234375, "learning_rate": 8.917379604624e-06, "loss": 0.039, "reward": 2.0496652722358704, "reward_std": 0.18288236483931541, "rewards/accuracy_reward": 0.07366071967408061, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973618745804, "step": 1945 }, { "clip_ratio": 0.0, "completion_length": 587.6942138671875, "epoch": 0.5812859383167799, "grad_norm": 0.29805898666381836, "kl": 0.0999755859375, "learning_rate": 8.907011245847049e-06, "loss": 0.0198, "reward": 2.1277902722358704, "reward_std": 0.13395540416240692, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.998325914144516, "step": 1946 }, { "clip_ratio": 0.0, "completion_length": 606.6562805175781, "epoch": 0.5815846464043014, "grad_norm": 0.15009276568889618, "kl": 0.18798828125, "learning_rate": 8.896644076134739e-06, "loss": 0.0074, "reward": 2.2064732909202576, "reward_std": 0.1418982855975628, "rewards/accuracy_reward": 0.2165178693830967, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966518133878708, "step": 1947 }, { "clip_ratio": 0.0, "completion_length": 550.8080596923828, "epoch": 0.5818833544918228, "grad_norm": 0.17531563341617584, "kl": 0.1060791015625, "learning_rate": 8.886278106765533e-06, "loss": 0.0215, "reward": 2.1741071939468384, "reward_std": 0.13245438504964113, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 1948 }, { "clip_ratio": 0.0, "completion_length": 600.3370666503906, "epoch": 0.5821820625793444, "grad_norm": 0.837383508682251, "kl": 0.3988037109375, "learning_rate": 8.87591334901659e-06, "loss": 0.0275, "reward": 2.1004465222358704, "reward_std": 0.17031561583280563, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9910714775323868, "step": 1949 }, { "clip_ratio": 0.0, "completion_length": 566.5870819091797, "epoch": 0.5824807706668658, "grad_norm": 0.5164145231246948, "kl": 0.209716796875, "learning_rate": 8.865549814163752e-06, "loss": 0.0402, "reward": 2.1668527722358704, "reward_std": 0.1252197464928031, "rewards/accuracy_reward": 0.18526786658912897, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455633878708, "step": 1950 }, { "clip_ratio": 0.0, "completion_length": 574.1451110839844, "epoch": 0.5827794787543873, "grad_norm": 0.21480479836463928, "kl": 0.145751953125, "learning_rate": 8.855187513481527e-06, "loss": 0.0127, "reward": 2.1997768878936768, "reward_std": 0.17299088835716248, "rewards/accuracy_reward": 0.2187500037252903, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9944196939468384, "step": 1951 }, { "clip_ratio": 0.0, "completion_length": 608.6004791259766, "epoch": 0.5830781868419087, "grad_norm": 0.1581297069787979, "kl": 0.0989990234375, "learning_rate": 8.844826458243083e-06, "loss": 0.0208, "reward": 2.160714328289032, "reward_std": 0.177910840138793, "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 1952 }, { "clip_ratio": 0.0, "completion_length": 626.7321624755859, "epoch": 0.5833768949294302, "grad_norm": 0.38856080174446106, "kl": 0.3485107421875, "learning_rate": 8.834466659720234e-06, "loss": 0.0631, "reward": 2.0892858505249023, "reward_std": 0.2217111773788929, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9910714775323868, "step": 1953 }, { "clip_ratio": 0.0, "completion_length": 614.7455596923828, "epoch": 0.5836756030169516, "grad_norm": 0.971146821975708, "kl": 0.3258056640625, "learning_rate": 8.824108129183427e-06, "loss": 0.0246, "reward": 2.1941965222358704, "reward_std": 0.18713731691241264, "rewards/accuracy_reward": 0.21651786426082253, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9910714626312256, "step": 1954 }, { "clip_ratio": 0.0, "completion_length": 618.9174194335938, "epoch": 0.5839743111044732, "grad_norm": 0.4892478287220001, "kl": 0.1014404296875, "learning_rate": 8.813750877901723e-06, "loss": 0.06, "reward": 2.084821581840515, "reward_std": 0.15220902860164642, "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9955357313156128, "step": 1955 }, { "clip_ratio": 0.0, "completion_length": 646.7522583007812, "epoch": 0.5842730191919946, "grad_norm": 0.11014779657125473, "kl": 0.1510009765625, "learning_rate": 8.803394917142797e-06, "loss": 0.0364, "reward": 2.034598261117935, "reward_std": 0.10575867909938097, "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9921875149011612, "step": 1956 }, { "clip_ratio": 0.0, "completion_length": 616.6451110839844, "epoch": 0.5845717272795161, "grad_norm": 0.19856391847133636, "kl": 0.1461181640625, "learning_rate": 8.793040258172926e-06, "loss": 0.0659, "reward": 2.190290331840515, "reward_std": 0.24567748233675957, "rewards/accuracy_reward": 0.22767858020961285, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9849330633878708, "step": 1957 }, { "clip_ratio": 0.0, "completion_length": 579.654052734375, "epoch": 0.5848704353670375, "grad_norm": 0.29784470796585083, "kl": 0.15185546875, "learning_rate": 8.782686912256957e-06, "loss": 0.0518, "reward": 2.1523438692092896, "reward_std": 0.2262691631913185, "rewards/accuracy_reward": 0.1808035857975483, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9871651977300644, "step": 1958 }, { "clip_ratio": 0.0, "completion_length": 615.763427734375, "epoch": 0.5851691434545591, "grad_norm": 0.3285141587257385, "kl": 0.212890625, "learning_rate": 8.772334890658317e-06, "loss": 0.0552, "reward": 2.1088171005249023, "reward_std": 0.27579705230891705, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9860491454601288, "step": 1959 }, { "clip_ratio": 0.0, "completion_length": 635.232177734375, "epoch": 0.5854678515420805, "grad_norm": 0.36849603056907654, "kl": 0.230224609375, "learning_rate": 8.761984204638994e-06, "loss": 0.0778, "reward": 2.0100447237491608, "reward_std": 0.24392309039831161, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9765625298023224, "step": 1960 }, { "clip_ratio": 0.0, "completion_length": 659.6942291259766, "epoch": 0.585766559629602, "grad_norm": 1.7661768198013306, "kl": 0.552001953125, "learning_rate": 8.751634865459518e-06, "loss": 0.0882, "reward": 2.0273437798023224, "reward_std": 0.24174839630723, "rewards/accuracy_reward": 0.09151786542497575, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9760045111179352, "step": 1961 }, { "clip_ratio": 0.0, "completion_length": 641.6116333007812, "epoch": 0.5860652677171234, "grad_norm": 0.6492117047309875, "kl": 0.38671875, "learning_rate": 8.741286884378954e-06, "loss": 0.1056, "reward": 2.108817160129547, "reward_std": 0.274643886834383, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9726562947034836, "step": 1962 }, { "clip_ratio": 0.0, "completion_length": 599.5781555175781, "epoch": 0.5863639758046449, "grad_norm": 0.4099963903427124, "kl": 0.222900390625, "learning_rate": 8.73094027265489e-06, "loss": 0.0706, "reward": 2.056361675262451, "reward_std": 0.17953431978821754, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9893973767757416, "step": 1963 }, { "clip_ratio": 0.0, "completion_length": 616.4598541259766, "epoch": 0.5866626838921664, "grad_norm": 0.6665926575660706, "kl": 0.310302734375, "learning_rate": 8.720595041543433e-06, "loss": 0.1222, "reward": 2.0284599363803864, "reward_std": 0.30176180228590965, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9726562798023224, "step": 1964 }, { "clip_ratio": 0.0, "completion_length": 604.2410888671875, "epoch": 0.5869613919796879, "grad_norm": 0.6391229033470154, "kl": 0.51025390625, "learning_rate": 8.710251202299174e-06, "loss": 0.1048, "reward": 2.0039063692092896, "reward_std": 0.36250270158052444, "rewards/accuracy_reward": 0.09821428847499192, "rewards/format_reward": 0.9419643431901932, "rewards/tag_count_reward": 0.9637277126312256, "step": 1965 }, { "clip_ratio": 0.0, "completion_length": 578.0357513427734, "epoch": 0.5872601000672093, "grad_norm": 0.8480169773101807, "kl": 0.47216796875, "learning_rate": 8.699908766175195e-06, "loss": 0.0797, "reward": 2.0809153020381927, "reward_std": 0.24156131967902184, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9760045111179352, "step": 1966 }, { "clip_ratio": 0.0, "completion_length": 581.1161193847656, "epoch": 0.5875588081547308, "grad_norm": 0.4125616252422333, "kl": 0.371337890625, "learning_rate": 8.68956774442306e-06, "loss": 0.0686, "reward": 2.178013503551483, "reward_std": 0.306006096303463, "rewards/accuracy_reward": 0.2388392947614193, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9793527275323868, "step": 1967 }, { "clip_ratio": 0.0, "completion_length": 573.779052734375, "epoch": 0.5878575162422522, "grad_norm": 0.47523263096809387, "kl": 0.28369140625, "learning_rate": 8.679228148292782e-06, "loss": 0.0661, "reward": 1.9804688394069672, "reward_std": 0.20983291417360306, "rewards/accuracy_reward": 0.024553572293370962, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9827009439468384, "step": 1968 }, { "clip_ratio": 0.0, "completion_length": 609.4643249511719, "epoch": 0.5881562243297738, "grad_norm": 0.4582006633281708, "kl": 0.1805419921875, "learning_rate": 8.66888998903283e-06, "loss": 0.0433, "reward": 2.070312649011612, "reward_std": 0.20940091833472252, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9899553954601288, "step": 1969 }, { "clip_ratio": 0.0, "completion_length": 570.5580596923828, "epoch": 0.5884549324172952, "grad_norm": 0.6826907396316528, "kl": 0.208984375, "learning_rate": 8.658553277890102e-06, "loss": 0.0083, "reward": 2.0970983505249023, "reward_std": 0.08620638307183981, "rewards/accuracy_reward": 0.10714285937137902, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966518133878708, "step": 1970 }, { "clip_ratio": 0.0, "completion_length": 572.1027069091797, "epoch": 0.5887536405048167, "grad_norm": 0.7168413400650024, "kl": 0.1871337890625, "learning_rate": 8.648218026109937e-06, "loss": 0.0282, "reward": 2.0703126192092896, "reward_std": 0.08682949468493462, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.994419664144516, "step": 1971 }, { "clip_ratio": 0.0, "completion_length": 557.2879638671875, "epoch": 0.5890523485923381, "grad_norm": 1.8065636157989502, "kl": 0.20068359375, "learning_rate": 8.637884244936069e-06, "loss": 0.0345, "reward": 2.271763503551483, "reward_std": 0.15911667980253696, "rewards/accuracy_reward": 0.2857142984867096, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 1972 }, { "clip_ratio": 0.0, "completion_length": 559.8727874755859, "epoch": 0.5893510566798597, "grad_norm": 0.26543670892715454, "kl": 0.198974609375, "learning_rate": 8.627551945610641e-06, "loss": 0.0389, "reward": 2.0619421005249023, "reward_std": 0.12424705270677805, "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949777126312256, "step": 1973 }, { "clip_ratio": 0.0, "completion_length": 489.65181732177734, "epoch": 0.5896497647673811, "grad_norm": 0.2599751949310303, "kl": 0.1583251953125, "learning_rate": 8.617221139374181e-06, "loss": 0.0125, "reward": 2.091517984867096, "reward_std": 0.11424114927649498, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 1974 }, { "clip_ratio": 0.0, "completion_length": 539.2768173217773, "epoch": 0.5899484728549026, "grad_norm": 0.20254763960838318, "kl": 0.1728515625, "learning_rate": 8.606891837465596e-06, "loss": 0.0232, "reward": 2.166852831840515, "reward_std": 0.17511921748518944, "rewards/accuracy_reward": 0.1830357275903225, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 1975 }, { "clip_ratio": 0.0, "completion_length": 517.3638610839844, "epoch": 0.590247180942424, "grad_norm": 0.12547466158866882, "kl": 0.106689453125, "learning_rate": 8.596564051122152e-06, "loss": 0.004, "reward": 2.149553656578064, "reward_std": 0.08406830858439207, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 1976 }, { "clip_ratio": 0.0, "completion_length": 531.1026992797852, "epoch": 0.5905458890299455, "grad_norm": 0.16461190581321716, "kl": 0.2781982421875, "learning_rate": 8.586237791579466e-06, "loss": 0.0118, "reward": 2.1143974661827087, "reward_std": 0.08658832591027021, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9938616305589676, "step": 1977 }, { "clip_ratio": 0.0, "completion_length": 539.6116333007812, "epoch": 0.590844597117467, "grad_norm": 0.1067279651761055, "kl": 0.10205078125, "learning_rate": 8.575913070071503e-06, "loss": 0.0188, "reward": 2.143415331840515, "reward_std": 0.08283309359103441, "rewards/accuracy_reward": 0.15178572316654027, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 1978 }, { "clip_ratio": 0.0, "completion_length": 516.1361923217773, "epoch": 0.5911433052049885, "grad_norm": 1.3850501775741577, "kl": 0.217041015625, "learning_rate": 8.565589897830543e-06, "loss": 0.0192, "reward": 2.083147406578064, "reward_std": 0.19054771400988102, "rewards/accuracy_reward": 0.10044643026776612, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9938616305589676, "step": 1979 }, { "clip_ratio": 0.0, "completion_length": 557.6138458251953, "epoch": 0.5914420132925099, "grad_norm": 0.15909956395626068, "kl": 0.1365966796875, "learning_rate": 8.555268286087187e-06, "loss": 0.0338, "reward": 2.0809152722358704, "reward_std": 0.1352866105735302, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9938616305589676, "step": 1980 }, { "clip_ratio": 0.0, "completion_length": 541.8549346923828, "epoch": 0.5917407213800313, "grad_norm": 0.1036207303404808, "kl": 0.1236572265625, "learning_rate": 8.544948246070335e-06, "loss": 0.014, "reward": 2.093750089406967, "reward_std": 0.08574779983609915, "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.995535746216774, "step": 1981 }, { "clip_ratio": 0.0, "completion_length": 535.4553680419922, "epoch": 0.5920394294675528, "grad_norm": 0.14762099087238312, "kl": 0.15673828125, "learning_rate": 8.534629789007183e-06, "loss": 0.0152, "reward": 2.1132813692092896, "reward_std": 0.20241502672433853, "rewards/accuracy_reward": 0.13169643841683865, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9927455633878708, "step": 1982 }, { "clip_ratio": 0.0, "completion_length": 544.2522430419922, "epoch": 0.5923381375550743, "grad_norm": 0.17293523252010345, "kl": 0.3111572265625, "learning_rate": 8.524312926123199e-06, "loss": 0.0334, "reward": 2.0295759737491608, "reward_std": 0.1559038609266281, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9871652126312256, "step": 1983 }, { "clip_ratio": 0.0, "completion_length": 573.6696624755859, "epoch": 0.5926368456425958, "grad_norm": 0.15029947459697723, "kl": 0.123046875, "learning_rate": 8.513997668642117e-06, "loss": 0.0321, "reward": 2.049107253551483, "reward_std": 0.17024172469973564, "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714626312256, "step": 1984 }, { "clip_ratio": 0.0, "completion_length": 563.966552734375, "epoch": 0.5929355537301172, "grad_norm": 0.1261949986219406, "kl": 0.121337890625, "learning_rate": 8.503684027785929e-06, "loss": 0.027, "reward": 2.2260045409202576, "reward_std": 0.13496027328073978, "rewards/accuracy_reward": 0.2366071529686451, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937649011612, "step": 1985 }, { "clip_ratio": 0.0, "completion_length": 552.6071624755859, "epoch": 0.5932342618176387, "grad_norm": 0.17885757982730865, "kl": 0.142578125, "learning_rate": 8.493372014774863e-06, "loss": 0.0402, "reward": 2.0580358505249023, "reward_std": 0.13525152020156384, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9888392984867096, "step": 1986 }, { "clip_ratio": 0.0, "completion_length": 572.8973388671875, "epoch": 0.5935329699051601, "grad_norm": 0.10935112833976746, "kl": 0.2745361328125, "learning_rate": 8.48306164082738e-06, "loss": 0.018, "reward": 2.075892925262451, "reward_std": 0.09153290838003159, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 1987 }, { "clip_ratio": 0.0, "completion_length": 553.3125228881836, "epoch": 0.5938316779926817, "grad_norm": 0.19727858901023865, "kl": 0.392822265625, "learning_rate": 8.472752917160155e-06, "loss": 0.0075, "reward": 2.108258992433548, "reward_std": 0.17499813809990883, "rewards/accuracy_reward": 0.12723215157166123, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9921875149011612, "step": 1988 }, { "clip_ratio": 0.0, "completion_length": 551.1049499511719, "epoch": 0.5941303860802031, "grad_norm": 0.22062301635742188, "kl": 0.1114501953125, "learning_rate": 8.462445854988071e-06, "loss": 0.0056, "reward": 2.213169753551483, "reward_std": 0.12950171530246735, "rewards/accuracy_reward": 0.2187500149011612, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9988839328289032, "step": 1989 }, { "clip_ratio": 0.0, "completion_length": 603.6116333007812, "epoch": 0.5944290941677246, "grad_norm": 0.22245189547538757, "kl": 0.1292724609375, "learning_rate": 8.452140465524201e-06, "loss": 0.0168, "reward": 2.0920759439468384, "reward_std": 0.10593733750283718, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 1990 }, { "clip_ratio": 0.0, "completion_length": 593.3393249511719, "epoch": 0.594727802255246, "grad_norm": 0.5939856767654419, "kl": 0.1417236328125, "learning_rate": 8.441836759979796e-06, "loss": 0.0461, "reward": 1.985491156578064, "reward_std": 0.1532503291964531, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.987723246216774, "step": 1991 }, { "clip_ratio": 0.0, "completion_length": 572.4419708251953, "epoch": 0.5950265103427675, "grad_norm": 0.21215425431728363, "kl": 0.1187744140625, "learning_rate": 8.43153474956428e-06, "loss": 0.0193, "reward": 2.0825893878936768, "reward_std": 0.1459372565150261, "rewards/accuracy_reward": 0.1026785762514919, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9910714477300644, "step": 1992 }, { "clip_ratio": 0.0, "completion_length": 581.3750381469727, "epoch": 0.595325218430289, "grad_norm": 2.4232118129730225, "kl": 0.68505859375, "learning_rate": 8.421234445485232e-06, "loss": 0.0466, "reward": 1.9988840222358704, "reward_std": 0.1037907712161541, "rewards/accuracy_reward": 0.017857143422588706, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875447034836, "step": 1993 }, { "clip_ratio": 0.0, "completion_length": 570.888427734375, "epoch": 0.5956239265178105, "grad_norm": 0.2480495721101761, "kl": 0.327880859375, "learning_rate": 8.410935858948372e-06, "loss": 0.0351, "reward": 2.079241156578064, "reward_std": 0.16216813772916794, "rewards/accuracy_reward": 0.10044643119908869, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9899553954601288, "step": 1994 }, { "clip_ratio": 0.0, "completion_length": 613.0067291259766, "epoch": 0.5959226346053319, "grad_norm": 0.408537358045578, "kl": 0.3515625, "learning_rate": 8.400639001157549e-06, "loss": 0.0215, "reward": 2.0357143878936768, "reward_std": 0.1063158418983221, "rewards/accuracy_reward": 0.05133928940631449, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933036118745804, "step": 1995 }, { "clip_ratio": 0.0, "completion_length": 630.4040374755859, "epoch": 0.5962213426928534, "grad_norm": 0.36376842856407166, "kl": 0.1260986328125, "learning_rate": 8.39034388331474e-06, "loss": 0.0178, "reward": 2.134486734867096, "reward_std": 0.1441460894420743, "rewards/accuracy_reward": 0.14732143841683865, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9960937798023224, "step": 1996 }, { "clip_ratio": 0.0, "completion_length": 607.3861846923828, "epoch": 0.5965200507803748, "grad_norm": 0.14896602928638458, "kl": 0.2294921875, "learning_rate": 8.380050516620026e-06, "loss": 0.0189, "reward": 2.107142984867096, "reward_std": 0.18059588316828012, "rewards/accuracy_reward": 0.11830357392318547, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.9955357313156128, "step": 1997 }, { "clip_ratio": 0.0, "completion_length": 645.3594207763672, "epoch": 0.5968187588678964, "grad_norm": 0.14343227446079254, "kl": 0.138916015625, "learning_rate": 8.369758912271573e-06, "loss": 0.0201, "reward": 2.115513563156128, "reward_std": 0.09767604153603315, "rewards/accuracy_reward": 0.12946429080329835, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949777126312256, "step": 1998 }, { "clip_ratio": 0.0, "completion_length": 622.7835083007812, "epoch": 0.5971174669554178, "grad_norm": 0.5226101875305176, "kl": 0.2398681640625, "learning_rate": 8.359469081465645e-06, "loss": 0.0488, "reward": 2.0424107909202576, "reward_std": 0.18227626010775566, "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714775323868, "step": 1999 }, { "clip_ratio": 0.0, "completion_length": 605.2411041259766, "epoch": 0.5974161750429393, "grad_norm": 0.14964400231838226, "kl": 0.1436767578125, "learning_rate": 8.349181035396568e-06, "loss": 0.0281, "reward": 2.1021206378936768, "reward_std": 0.11754334531724453, "rewards/accuracy_reward": 0.11607143143191934, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9949776977300644, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 660.3995819091797, "epoch": 0.5977148831304607, "grad_norm": 0.20725226402282715, "kl": 0.1483154296875, "learning_rate": 8.338894785256726e-06, "loss": 0.0235, "reward": 2.0513394474983215, "reward_std": 0.16932361293584108, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9910714477300644, "step": 2001 }, { "clip_ratio": 0.0, "completion_length": 617.5268249511719, "epoch": 0.5980135912179823, "grad_norm": 0.3259180188179016, "kl": 0.1549072265625, "learning_rate": 8.32861034223655e-06, "loss": 0.0641, "reward": 2.209263503551483, "reward_std": 0.2231181040406227, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455633878708, "step": 2002 }, { "clip_ratio": 0.0, "completion_length": 650.2678680419922, "epoch": 0.5983122993055037, "grad_norm": 0.45232051610946655, "kl": 0.2025146484375, "learning_rate": 8.31832771752451e-06, "loss": 0.0463, "reward": 2.162388503551483, "reward_std": 0.24591970816254616, "rewards/accuracy_reward": 0.2053571566939354, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.9860491305589676, "step": 2003 }, { "clip_ratio": 0.0, "completion_length": 639.5513610839844, "epoch": 0.5986110073930252, "grad_norm": 0.5598156452178955, "kl": 0.1109619140625, "learning_rate": 8.308046922307091e-06, "loss": 0.0381, "reward": 2.0005581080913544, "reward_std": 0.206568643450737, "rewards/accuracy_reward": 0.03794642933644354, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9893973618745804, "step": 2004 }, { "clip_ratio": 0.0, "completion_length": 589.3571624755859, "epoch": 0.5989097154805466, "grad_norm": 0.3441433608531952, "kl": 0.4173583984375, "learning_rate": 8.29776796776879e-06, "loss": 0.0285, "reward": 2.188616096973419, "reward_std": 0.20396124199032784, "rewards/accuracy_reward": 0.2187500149011612, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9877232611179352, "step": 2005 }, { "clip_ratio": 0.0, "completion_length": 613.6451110839844, "epoch": 0.5992084235680681, "grad_norm": 1.007623553276062, "kl": 0.58251953125, "learning_rate": 8.287490865092106e-06, "loss": 0.0262, "reward": 2.1506696939468384, "reward_std": 0.1438368447124958, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.994419664144516, "step": 2006 }, { "clip_ratio": 0.0, "completion_length": 638.3549346923828, "epoch": 0.5995071316555896, "grad_norm": 3.3493449687957764, "kl": 0.830078125, "learning_rate": 8.277215625457516e-06, "loss": 0.0604, "reward": 2.0385045409202576, "reward_std": 0.16339887492358685, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616454601288, "step": 2007 }, { "clip_ratio": 0.0, "completion_length": 599.6830596923828, "epoch": 0.5998058397431111, "grad_norm": 0.32738402485847473, "kl": 0.254150390625, "learning_rate": 8.266942260043474e-06, "loss": 0.0691, "reward": 1.997767984867096, "reward_std": 0.1746789664030075, "rewards/accuracy_reward": 0.03348214295692742, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9843750596046448, "step": 2008 }, { "clip_ratio": 0.0, "completion_length": 613.5826110839844, "epoch": 0.6001045478306325, "grad_norm": 0.9169827699661255, "kl": 0.246826171875, "learning_rate": 8.256670780026393e-06, "loss": 0.0466, "reward": 2.0770090520381927, "reward_std": 0.20056728273630142, "rewards/accuracy_reward": 0.10267857741564512, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.987723246216774, "step": 2009 }, { "clip_ratio": 0.0, "completion_length": 575.6540298461914, "epoch": 0.600403255918154, "grad_norm": 4.612494468688965, "kl": 0.986328125, "learning_rate": 8.246401196580642e-06, "loss": 0.1069, "reward": 2.107142984867096, "reward_std": 0.24509312584996223, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9866071790456772, "step": 2010 }, { "clip_ratio": 0.0, "completion_length": 561.8013610839844, "epoch": 0.6007019640056754, "grad_norm": 0.399199903011322, "kl": 0.3583984375, "learning_rate": 8.236133520878517e-06, "loss": 0.0186, "reward": 2.0786831378936768, "reward_std": 0.17553436756134033, "rewards/accuracy_reward": 0.09598215040750802, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.991629496216774, "step": 2011 }, { "clip_ratio": 0.0, "completion_length": 593.9665222167969, "epoch": 0.601000672093197, "grad_norm": 0.23117676377296448, "kl": 0.3226318359375, "learning_rate": 8.225867764090243e-06, "loss": 0.0252, "reward": 2.1707589626312256, "reward_std": 0.17663121316581964, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9921875149011612, "step": 2012 }, { "clip_ratio": 0.0, "completion_length": 571.9085006713867, "epoch": 0.6012993801807184, "grad_norm": 0.558133065700531, "kl": 0.38232421875, "learning_rate": 8.215603937383959e-06, "loss": 0.0202, "reward": 2.087611645460129, "reward_std": 0.20802647806704044, "rewards/accuracy_reward": 0.12053572246804833, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9871652275323868, "step": 2013 }, { "clip_ratio": 0.0, "completion_length": 601.4777069091797, "epoch": 0.6015980882682399, "grad_norm": 0.21158301830291748, "kl": 0.1422119140625, "learning_rate": 8.205342051925702e-06, "loss": 0.0161, "reward": 2.055803656578064, "reward_std": 0.1597053725272417, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933036267757416, "step": 2014 }, { "clip_ratio": 0.0, "completion_length": 574.5268096923828, "epoch": 0.6018967963557613, "grad_norm": 0.11952093988656998, "kl": 0.0955810546875, "learning_rate": 8.195082118879397e-06, "loss": 0.013, "reward": 2.0870537161827087, "reward_std": 0.13517801277339458, "rewards/accuracy_reward": 0.09151786146685481, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 2015 }, { "clip_ratio": 0.0, "completion_length": 587.9486694335938, "epoch": 0.6021955044432828, "grad_norm": 0.413765013217926, "kl": 1.0390625, "learning_rate": 8.184824149406843e-06, "loss": 0.0369, "reward": 2.1166296005249023, "reward_std": 0.20466038957238197, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9871652126312256, "step": 2016 }, { "clip_ratio": 0.0, "completion_length": 607.7344055175781, "epoch": 0.6024942125308043, "grad_norm": 0.5645737051963806, "kl": 0.4576416015625, "learning_rate": 8.174568154667712e-06, "loss": 0.0625, "reward": 2.0429688692092896, "reward_std": 0.25084217358380556, "rewards/accuracy_reward": 0.08258928777649999, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9804687947034836, "step": 2017 }, { "clip_ratio": 0.0, "completion_length": 607.950927734375, "epoch": 0.6027929206183258, "grad_norm": 0.2010897547006607, "kl": 0.09912109375, "learning_rate": 8.164314145819514e-06, "loss": 0.0071, "reward": 2.1065849363803864, "reward_std": 0.14337069634348154, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.997209832072258, "step": 2018 }, { "clip_ratio": 0.0, "completion_length": 632.9129791259766, "epoch": 0.6030916287058472, "grad_norm": 3.74204683303833, "kl": 0.8218994140625, "learning_rate": 8.15406213401761e-06, "loss": 0.0813, "reward": 2.0608259737491608, "reward_std": 0.21588735096156597, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.9665178805589676, "rewards/tag_count_reward": 0.9804687798023224, "step": 2019 }, { "clip_ratio": 0.0, "completion_length": 598.2611846923828, "epoch": 0.6033903367933687, "grad_norm": 0.24370822310447693, "kl": 0.2144775390625, "learning_rate": 8.143812130415182e-06, "loss": 0.001, "reward": 2.154017984867096, "reward_std": 0.15809014439582825, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933036118745804, "step": 2020 }, { "clip_ratio": 0.0, "completion_length": 566.6540451049805, "epoch": 0.6036890448808901, "grad_norm": 1.1060643196105957, "kl": 0.28955078125, "learning_rate": 8.133564146163232e-06, "loss": 0.0532, "reward": 2.1668527722358704, "reward_std": 0.27028267458081245, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.98604916036129, "step": 2021 }, { "clip_ratio": 0.0, "completion_length": 570.8058319091797, "epoch": 0.6039877529684117, "grad_norm": 1.3280746936798096, "kl": 0.194091796875, "learning_rate": 8.12331819241056e-06, "loss": 0.0427, "reward": 2.0223215520381927, "reward_std": 0.13674430642277002, "rewards/accuracy_reward": 0.05133928940631449, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9910714626312256, "step": 2022 }, { "clip_ratio": 0.0, "completion_length": 602.4219055175781, "epoch": 0.6042864610559331, "grad_norm": 1.6284525394439697, "kl": 0.4117431640625, "learning_rate": 8.11307428030376e-06, "loss": 0.0485, "reward": 2.1277903020381927, "reward_std": 0.183877844363451, "rewards/accuracy_reward": 0.14955357764847577, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9916294813156128, "step": 2023 }, { "clip_ratio": 0.0, "completion_length": 592.0736846923828, "epoch": 0.6045851691434545, "grad_norm": 1.1480368375778198, "kl": 0.2593994140625, "learning_rate": 8.102832420987205e-06, "loss": 0.026, "reward": 2.042968839406967, "reward_std": 0.12866787239909172, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9938616454601288, "step": 2024 }, { "clip_ratio": 0.0, "completion_length": 591.4799499511719, "epoch": 0.604883877230976, "grad_norm": 0.3064006567001343, "kl": 0.1837158203125, "learning_rate": 8.092592625603033e-06, "loss": 0.0347, "reward": 2.098214417695999, "reward_std": 0.14281171560287476, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9910714775323868, "step": 2025 }, { "clip_ratio": 0.0, "completion_length": 580.6741256713867, "epoch": 0.6051825853184974, "grad_norm": 0.22271902859210968, "kl": 0.128662109375, "learning_rate": 8.082354905291136e-06, "loss": 0.0324, "reward": 2.0892857909202576, "reward_std": 0.15285142697393894, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.995535746216774, "step": 2026 }, { "clip_ratio": 0.0, "completion_length": 595.5714416503906, "epoch": 0.605481293406019, "grad_norm": 1.4300485849380493, "kl": 0.634521484375, "learning_rate": 8.072119271189155e-06, "loss": 0.0359, "reward": 2.0814733505249023, "reward_std": 0.22114188969135284, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9854910969734192, "step": 2027 }, { "clip_ratio": 0.0, "completion_length": 567.1696548461914, "epoch": 0.6057800014935404, "grad_norm": 0.2860681116580963, "kl": 0.1644287109375, "learning_rate": 8.061885734432455e-06, "loss": 0.033, "reward": 2.1071430146694183, "reward_std": 0.18966221623122692, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9888393133878708, "step": 2028 }, { "clip_ratio": 0.0, "completion_length": 580.7299499511719, "epoch": 0.6060787095810619, "grad_norm": 0.45616182684898376, "kl": 0.3375244140625, "learning_rate": 8.05165430615412e-06, "loss": 0.0378, "reward": 2.0664063692092896, "reward_std": 0.15577204711735249, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9927455931901932, "step": 2029 }, { "clip_ratio": 0.0, "completion_length": 562.2165374755859, "epoch": 0.6063774176685833, "grad_norm": 0.5754091143608093, "kl": 0.26025390625, "learning_rate": 8.041424997484938e-06, "loss": 0.0558, "reward": 2.036830484867096, "reward_std": 0.26413361355662346, "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9832589775323868, "step": 2030 }, { "clip_ratio": 0.0, "completion_length": 543.9330596923828, "epoch": 0.6066761257561049, "grad_norm": 0.3761497437953949, "kl": 0.263427734375, "learning_rate": 8.031197819553398e-06, "loss": 0.1001, "reward": 2.0943081378936768, "reward_std": 0.26502785086631775, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.9620535969734192, "rewards/tag_count_reward": 0.97823666036129, "step": 2031 }, { "clip_ratio": 0.0, "completion_length": 566.2388610839844, "epoch": 0.6069748338436263, "grad_norm": 0.9954007863998413, "kl": 0.4345703125, "learning_rate": 8.020972783485671e-06, "loss": 0.073, "reward": 2.0479911267757416, "reward_std": 0.24320120364427567, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9854911118745804, "step": 2032 }, { "clip_ratio": 0.0, "completion_length": 594.4620819091797, "epoch": 0.6072735419311478, "grad_norm": 0.507079005241394, "kl": 0.248779296875, "learning_rate": 8.01074990040559e-06, "loss": 0.0519, "reward": 2.0463170409202576, "reward_std": 0.1908898027613759, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9860491454601288, "step": 2033 }, { "clip_ratio": 0.0, "completion_length": 585.2745819091797, "epoch": 0.6075722500186692, "grad_norm": 0.3443470001220703, "kl": 0.1575927734375, "learning_rate": 8.000529181434649e-06, "loss": 0.0369, "reward": 2.071428656578064, "reward_std": 0.19784246012568474, "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9933036267757416, "step": 2034 }, { "clip_ratio": 0.0, "completion_length": 531.3102951049805, "epoch": 0.6078709581061907, "grad_norm": 0.3073044717311859, "kl": 0.2100830078125, "learning_rate": 7.990310637691988e-06, "loss": 0.0837, "reward": 2.1434152722358704, "reward_std": 0.22905626147985458, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.984933078289032, "step": 2035 }, { "clip_ratio": 0.0, "completion_length": 544.4085235595703, "epoch": 0.6081696661937122, "grad_norm": 0.35713228583335876, "kl": 0.2540283203125, "learning_rate": 7.980094280294383e-06, "loss": 0.0345, "reward": 2.0245536267757416, "reward_std": 0.15078245662152767, "rewards/accuracy_reward": 0.04464286030270159, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714626312256, "step": 2036 }, { "clip_ratio": 0.0, "completion_length": 512.3526992797852, "epoch": 0.6084683742812337, "grad_norm": 0.2385704219341278, "kl": 0.232666015625, "learning_rate": 7.96988012035623e-06, "loss": 0.0227, "reward": 2.111049234867096, "reward_std": 0.12717285379767418, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 2037 }, { "clip_ratio": 0.0, "completion_length": 531.5089492797852, "epoch": 0.6087670823687551, "grad_norm": 1.580531358718872, "kl": 0.25390625, "learning_rate": 7.959668168989531e-06, "loss": 0.0109, "reward": 2.114955484867096, "reward_std": 0.16512488946318626, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.9910714328289032, "rewards/tag_count_reward": 0.9944196492433548, "step": 2038 }, { "clip_ratio": 0.0, "completion_length": 496.88841247558594, "epoch": 0.6090657904562766, "grad_norm": 0.8142208456993103, "kl": 0.4169921875, "learning_rate": 7.949458437303892e-06, "loss": 0.0304, "reward": 2.1668527722358704, "reward_std": 0.146327568218112, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9927455484867096, "step": 2039 }, { "clip_ratio": 0.0, "completion_length": 495.06475830078125, "epoch": 0.609364498543798, "grad_norm": 0.24968433380126953, "kl": 0.161865234375, "learning_rate": 7.939250936406499e-06, "loss": 0.0284, "reward": 2.166852831840515, "reward_std": 0.1974467635154724, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9927455931901932, "step": 2040 }, { "clip_ratio": 0.0, "completion_length": 465.86609649658203, "epoch": 0.6096632066313196, "grad_norm": 0.3496554493904114, "kl": 0.171875, "learning_rate": 7.92904567740211e-06, "loss": 0.0372, "reward": 2.0318081378936768, "reward_std": 0.16186819598078728, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 2041 }, { "clip_ratio": 0.0, "completion_length": 494.6205596923828, "epoch": 0.609961914718841, "grad_norm": 1.4287669658660889, "kl": 0.2178955078125, "learning_rate": 7.918842671393048e-06, "loss": 0.0267, "reward": 2.0864956080913544, "reward_std": 0.11226503551006317, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.997209832072258, "step": 2042 }, { "clip_ratio": 0.0, "completion_length": 516.4352874755859, "epoch": 0.6102606228063625, "grad_norm": 0.6093387603759766, "kl": 0.2265625, "learning_rate": 7.908641929479187e-06, "loss": 0.0269, "reward": 2.068080484867096, "reward_std": 0.1339448243379593, "rewards/accuracy_reward": 0.08705357764847577, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9921875298023224, "step": 2043 }, { "clip_ratio": 0.0, "completion_length": 557.2165374755859, "epoch": 0.6105593308938839, "grad_norm": 0.2721146047115326, "kl": 0.33837890625, "learning_rate": 7.898443462757933e-06, "loss": -0.0043, "reward": 2.1545759439468384, "reward_std": 0.20598334446549416, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9916295260190964, "step": 2044 }, { "clip_ratio": 0.0, "completion_length": 537.9397430419922, "epoch": 0.6108580389814054, "grad_norm": 0.16789671778678894, "kl": 0.169189453125, "learning_rate": 7.888247282324212e-06, "loss": 0.0039, "reward": 2.135044813156128, "reward_std": 0.1711404100060463, "rewards/accuracy_reward": 0.14732143376022577, "rewards/format_reward": 0.9910714328289032, "rewards/tag_count_reward": 0.9966518133878708, "step": 2045 }, { "clip_ratio": 0.0, "completion_length": 536.5178909301758, "epoch": 0.6111567470689269, "grad_norm": 0.3677961230278015, "kl": 0.3006591796875, "learning_rate": 7.878053399270475e-06, "loss": 0.0465, "reward": 2.0066965222358704, "reward_std": 0.23733942955732346, "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.975446492433548, "rewards/tag_count_reward": 0.98214291036129, "step": 2046 }, { "clip_ratio": 0.0, "completion_length": 545.3750305175781, "epoch": 0.6114554551564484, "grad_norm": 0.17578208446502686, "kl": 0.109375, "learning_rate": 7.86786182468667e-06, "loss": 0.0228, "reward": 2.126674234867096, "reward_std": 0.12120267935097218, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 2047 }, { "clip_ratio": 0.0, "completion_length": 547.3013610839844, "epoch": 0.6117541632439698, "grad_norm": 0.33836033940315247, "kl": 0.10791015625, "learning_rate": 7.857672569660226e-06, "loss": 0.0293, "reward": 2.089843839406967, "reward_std": 0.16453610360622406, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9938616305589676, "step": 2048 }, { "clip_ratio": 0.0, "completion_length": 634.0044708251953, "epoch": 0.6120528713314913, "grad_norm": 0.3001931309700012, "kl": 0.1055908203125, "learning_rate": 7.847485645276053e-06, "loss": 0.0354, "reward": 2.0697546005249023, "reward_std": 0.1688420083373785, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973618745804, "step": 2049 }, { "clip_ratio": 0.0, "completion_length": 539.5759124755859, "epoch": 0.6123515794190127, "grad_norm": 0.21329812705516815, "kl": 0.091064453125, "learning_rate": 7.837301062616531e-06, "loss": 0.0012, "reward": 2.1166296005249023, "reward_std": 0.0763626080006361, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9983258992433548, "step": 2050 }, { "clip_ratio": 0.0, "completion_length": 586.9576263427734, "epoch": 0.6126502875065343, "grad_norm": 0.31454548239707947, "kl": 0.1929931640625, "learning_rate": 7.827118832761487e-06, "loss": 0.0433, "reward": 2.115513563156128, "reward_std": 0.22436382621526718, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9860491305589676, "step": 2051 }, { "clip_ratio": 0.0, "completion_length": 572.4330520629883, "epoch": 0.6129489955940557, "grad_norm": 0.2499733418226242, "kl": 0.115234375, "learning_rate": 7.816938966788185e-06, "loss": 0.04, "reward": 2.181361675262451, "reward_std": 0.1572945937514305, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.991629496216774, "step": 2052 }, { "clip_ratio": 0.0, "completion_length": 598.8036041259766, "epoch": 0.6132477036815772, "grad_norm": 0.25493869185447693, "kl": 0.266357421875, "learning_rate": 7.806761475771325e-06, "loss": 0.0392, "reward": 2.1071429550647736, "reward_std": 0.251674460247159, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9821428805589676, "step": 2053 }, { "clip_ratio": 0.0, "completion_length": 600.6585083007812, "epoch": 0.6135464117690986, "grad_norm": 0.208938866853714, "kl": 0.1334228515625, "learning_rate": 7.796586370783019e-06, "loss": 0.0217, "reward": 2.130022406578064, "reward_std": 0.12454343866556883, "rewards/accuracy_reward": 0.14285714644938707, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9938616305589676, "step": 2054 }, { "clip_ratio": 0.0, "completion_length": 608.5156402587891, "epoch": 0.6138451198566202, "grad_norm": 0.2597469091415405, "kl": 0.249267578125, "learning_rate": 7.786413662892785e-06, "loss": 0.0368, "reward": 2.060267984867096, "reward_std": 0.14879614487290382, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9910714477300644, "step": 2055 }, { "clip_ratio": 0.0, "completion_length": 620.2344055175781, "epoch": 0.6141438279441416, "grad_norm": 0.8213784098625183, "kl": 0.48291015625, "learning_rate": 7.776243363167529e-06, "loss": 0.0388, "reward": 2.0552456378936768, "reward_std": 0.2039932832121849, "rewards/accuracy_reward": 0.0915178656578064, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.983816996216774, "step": 2056 }, { "clip_ratio": 0.0, "completion_length": 573.6071624755859, "epoch": 0.6144425360316631, "grad_norm": 7.846925735473633, "kl": 1.487548828125, "learning_rate": 7.766075482671544e-06, "loss": 0.1049, "reward": 2.0301340222358704, "reward_std": 0.2335708076134324, "rewards/accuracy_reward": 0.064732147147879, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9832589626312256, "step": 2057 }, { "clip_ratio": 0.0, "completion_length": 590.2522583007812, "epoch": 0.6147412441191845, "grad_norm": 0.17436224222183228, "kl": 0.11669921875, "learning_rate": 7.755910032466485e-06, "loss": 0.0104, "reward": 2.1568081378936768, "reward_std": 0.1193621726706624, "rewards/accuracy_reward": 0.16071429220028222, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9983258992433548, "step": 2058 }, { "clip_ratio": 0.0, "completion_length": 532.3995895385742, "epoch": 0.615039952206706, "grad_norm": 2.7346503734588623, "kl": 0.6705322265625, "learning_rate": 7.745747023611367e-06, "loss": 0.0692, "reward": 2.1595983505249023, "reward_std": 0.2944931089878082, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9854910969734192, "step": 2059 }, { "clip_ratio": 0.0, "completion_length": 566.4151916503906, "epoch": 0.6153386602942275, "grad_norm": 1.7915265560150146, "kl": 0.447998046875, "learning_rate": 7.735586467162544e-06, "loss": 0.0435, "reward": 2.032924175262451, "reward_std": 0.16712030582129955, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812947034836, "step": 2060 }, { "clip_ratio": 0.0, "completion_length": 611.4263763427734, "epoch": 0.615637368381749, "grad_norm": 0.21370628476142883, "kl": 0.1199951171875, "learning_rate": 7.725428374173712e-06, "loss": 0.0277, "reward": 2.0691965222358704, "reward_std": 0.16081291250884533, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714626312256, "step": 2061 }, { "clip_ratio": 0.0, "completion_length": 600.8303833007812, "epoch": 0.6159360764692704, "grad_norm": 0.3105943202972412, "kl": 0.12158203125, "learning_rate": 7.715272755695876e-06, "loss": 0.0219, "reward": 2.1445313692092896, "reward_std": 0.11993943713605404, "rewards/accuracy_reward": 0.15178572130389512, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 2062 }, { "clip_ratio": 0.0, "completion_length": 593.3281402587891, "epoch": 0.6162347845567919, "grad_norm": 0.17215152084827423, "kl": 0.1268310546875, "learning_rate": 7.705119622777351e-06, "loss": 0.0194, "reward": 2.154576003551483, "reward_std": 0.14754331670701504, "rewards/accuracy_reward": 0.17187500465661287, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9938616305589676, "step": 2063 }, { "clip_ratio": 0.0, "completion_length": 590.6986694335938, "epoch": 0.6165334926443133, "grad_norm": 0.2434154748916626, "kl": 0.1082763671875, "learning_rate": 7.694968986463758e-06, "loss": 0.0278, "reward": 2.1484375596046448, "reward_std": 0.17186937853693962, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9988839328289032, "step": 2064 }, { "clip_ratio": 0.0, "completion_length": 596.4442291259766, "epoch": 0.6168322007318349, "grad_norm": 0.9289751648902893, "kl": 0.3740234375, "learning_rate": 7.68482085779799e-06, "loss": 0.0593, "reward": 2.1640626788139343, "reward_std": 0.20048540458083153, "rewards/accuracy_reward": 0.18973215389996767, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9899553954601288, "step": 2065 }, { "clip_ratio": 0.0, "completion_length": 653.0424346923828, "epoch": 0.6171309088193563, "grad_norm": 1.3112074136734009, "kl": 0.2860107421875, "learning_rate": 7.674675247820215e-06, "loss": 0.0354, "reward": 1.9933035969734192, "reward_std": 0.11376677453517914, "rewards/accuracy_reward": 0.020089285913854837, "rewards/format_reward": 0.9821428656578064, "rewards/tag_count_reward": 0.9910714477300644, "step": 2066 }, { "clip_ratio": 0.0, "completion_length": 632.888427734375, "epoch": 0.6174296169068777, "grad_norm": 3.5395190715789795, "kl": 0.614990234375, "learning_rate": 7.664532167567864e-06, "loss": 0.056, "reward": 2.1316965222358704, "reward_std": 0.1537736840546131, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9933035969734192, "step": 2067 }, { "clip_ratio": 0.0, "completion_length": 610.0111999511719, "epoch": 0.6177283249943992, "grad_norm": 0.5909607410430908, "kl": 0.292236328125, "learning_rate": 7.654391628075616e-06, "loss": 0.0459, "reward": 2.1166295409202576, "reward_std": 0.18431755900382996, "rewards/accuracy_reward": 0.14732143632136285, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652275323868, "step": 2068 }, { "clip_ratio": 0.0, "completion_length": 519.0915374755859, "epoch": 0.6180270330819206, "grad_norm": 11.646056175231934, "kl": 1.2686767578125, "learning_rate": 7.644253640375382e-06, "loss": 0.1616, "reward": 2.1155134439468384, "reward_std": 0.21770687028765678, "rewards/accuracy_reward": 0.15178572572767735, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9838170111179352, "step": 2069 }, { "clip_ratio": 0.0, "completion_length": 590.1339569091797, "epoch": 0.6183257411694422, "grad_norm": 1.7276540994644165, "kl": 0.47021484375, "learning_rate": 7.634118215496298e-06, "loss": 0.0491, "reward": 2.08147332072258, "reward_std": 0.1364145651459694, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.9899553805589676, "step": 2070 }, { "clip_ratio": 0.0, "completion_length": 557.9710159301758, "epoch": 0.6186244492569636, "grad_norm": 0.5785210132598877, "kl": 0.7071533203125, "learning_rate": 7.623985364464715e-06, "loss": 0.0516, "reward": 2.1361607909202576, "reward_std": 0.12586330994963646, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9910714477300644, "step": 2071 }, { "clip_ratio": 0.0, "completion_length": 555.2008972167969, "epoch": 0.6189231573444851, "grad_norm": 0.5928044319152832, "kl": 0.301025390625, "learning_rate": 7.613855098304182e-06, "loss": 0.041, "reward": 2.0345983505249023, "reward_std": 0.14737638179212809, "rewards/accuracy_reward": 0.05803571711294353, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553805589676, "step": 2072 }, { "clip_ratio": 0.0, "completion_length": 594.3995971679688, "epoch": 0.6192218654320065, "grad_norm": 7.76837158203125, "kl": 0.7274169921875, "learning_rate": 7.6037274280354345e-06, "loss": 0.1015, "reward": 2.0239956080913544, "reward_std": 0.15049242787063122, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9905134290456772, "step": 2073 }, { "clip_ratio": 0.0, "completion_length": 640.0424499511719, "epoch": 0.619520573519528, "grad_norm": 1.5223110914230347, "kl": 0.26904296875, "learning_rate": 7.593602364676382e-06, "loss": 0.0288, "reward": 2.0541295409202576, "reward_std": 0.11819489486515522, "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 2074 }, { "clip_ratio": 0.0, "completion_length": 559.9285888671875, "epoch": 0.6198192816070495, "grad_norm": 0.6256917119026184, "kl": 0.4644775390625, "learning_rate": 7.583479919242108e-06, "loss": 0.067, "reward": 2.0474331378936768, "reward_std": 0.15680615790188313, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9871652126312256, "step": 2075 }, { "clip_ratio": 0.0, "completion_length": 555.3370819091797, "epoch": 0.620117989694571, "grad_norm": 0.9074476957321167, "kl": 0.596923828125, "learning_rate": 7.573360102744838e-06, "loss": 0.1006, "reward": 2.142299175262451, "reward_std": 0.11288669053465128, "rewards/accuracy_reward": 0.1629464402794838, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.990513414144516, "step": 2076 }, { "clip_ratio": 0.0, "completion_length": 599.0000152587891, "epoch": 0.6204166977820924, "grad_norm": 0.6681820750236511, "kl": 0.67138671875, "learning_rate": 7.563242926193937e-06, "loss": 0.0363, "reward": 2.138392984867096, "reward_std": 0.14939750544726849, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933036118745804, "step": 2077 }, { "clip_ratio": 0.0, "completion_length": 553.5915374755859, "epoch": 0.6207154058696139, "grad_norm": 0.387768417596817, "kl": 0.490234375, "learning_rate": 7.553128400595906e-06, "loss": -0.0012, "reward": 2.055803656578064, "reward_std": 0.10764012392610312, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9977678805589676, "step": 2078 }, { "clip_ratio": 0.0, "completion_length": 614.3683319091797, "epoch": 0.6210141139571353, "grad_norm": 0.1599700003862381, "kl": 0.1148681640625, "learning_rate": 7.5430165369543566e-06, "loss": 0.0155, "reward": 2.039062649011612, "reward_std": 0.08243321813642979, "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 2079 }, { "clip_ratio": 0.0, "completion_length": 573.716552734375, "epoch": 0.6213128220446569, "grad_norm": 0.3255632221698761, "kl": 0.3651123046875, "learning_rate": 7.532907346270004e-06, "loss": 0.0263, "reward": 2.0898438096046448, "reward_std": 0.159959115087986, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973618745804, "step": 2080 }, { "clip_ratio": 0.0, "completion_length": 584.6294784545898, "epoch": 0.6216115301321783, "grad_norm": 6.466292381286621, "kl": 1.0794677734375, "learning_rate": 7.522800839540656e-06, "loss": 0.1064, "reward": 2.021763503551483, "reward_std": 0.17395696509629488, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9905134290456772, "step": 2081 }, { "clip_ratio": 0.0, "completion_length": 617.294677734375, "epoch": 0.6219102382196998, "grad_norm": 0.3158479630947113, "kl": 0.408203125, "learning_rate": 7.512697027761204e-06, "loss": 0.0396, "reward": 2.1953126192092896, "reward_std": 0.18311485275626183, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9899553805589676, "step": 2082 }, { "clip_ratio": 0.0, "completion_length": 600.7946624755859, "epoch": 0.6222089463072212, "grad_norm": 1.381428837776184, "kl": 0.5703125, "learning_rate": 7.5025959219236055e-06, "loss": 0.0576, "reward": 2.071986675262451, "reward_std": 0.19548925012350082, "rewards/accuracy_reward": 0.09375000675208867, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9916295111179352, "step": 2083 }, { "clip_ratio": 0.0, "completion_length": 596.5647430419922, "epoch": 0.6225076543947428, "grad_norm": 0.5019568204879761, "kl": 0.4171142578125, "learning_rate": 7.49249753301687e-06, "loss": 0.0347, "reward": 2.102678656578064, "reward_std": 0.12482774257659912, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 2084 }, { "clip_ratio": 0.0, "completion_length": 590.1406555175781, "epoch": 0.6228063624822642, "grad_norm": 1.3081562519073486, "kl": 0.4095458984375, "learning_rate": 7.482401872027058e-06, "loss": 0.0268, "reward": 2.1512277722358704, "reward_std": 0.1293898979201913, "rewards/accuracy_reward": 0.16517857648432255, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 2085 }, { "clip_ratio": 0.0, "completion_length": 599.6763610839844, "epoch": 0.6231050705697857, "grad_norm": 9.719858169555664, "kl": 1.4676513671875, "learning_rate": 7.4723089499372595e-06, "loss": 0.0972, "reward": 2.0742188692092896, "reward_std": 0.14349348656833172, "rewards/accuracy_reward": 0.09151786239817739, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.991629496216774, "step": 2086 }, { "clip_ratio": 0.0, "completion_length": 570.7120819091797, "epoch": 0.6234037786573071, "grad_norm": 0.1851445585489273, "kl": 0.239501953125, "learning_rate": 7.462218777727581e-06, "loss": 0.0249, "reward": 2.1411831974983215, "reward_std": 0.15684681944549084, "rewards/accuracy_reward": 0.16294643515720963, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.991629496216774, "step": 2087 }, { "clip_ratio": 0.0, "completion_length": 589.1562652587891, "epoch": 0.6237024867448286, "grad_norm": 0.2562994956970215, "kl": 0.128173828125, "learning_rate": 7.452131366375142e-06, "loss": 0.0235, "reward": 2.076451063156128, "reward_std": 0.1131489360705018, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937649011612, "step": 2088 }, { "clip_ratio": 0.0, "completion_length": 582.9844055175781, "epoch": 0.6240011948323501, "grad_norm": 1.4579417705535889, "kl": 0.46337890625, "learning_rate": 7.442046726854061e-06, "loss": -0.0073, "reward": 2.0652903020381927, "reward_std": 0.1658739559352398, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.991629496216774, "step": 2089 }, { "clip_ratio": 0.0, "completion_length": 602.372802734375, "epoch": 0.6242999029198716, "grad_norm": 2.4400947093963623, "kl": 0.130615234375, "learning_rate": 7.4319648701354355e-06, "loss": 0.0339, "reward": 2.0591518878936768, "reward_std": 0.08060094341635704, "rewards/accuracy_reward": 0.06919643003493547, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966518133878708, "step": 2090 }, { "clip_ratio": 0.0, "completion_length": 567.685302734375, "epoch": 0.624598611007393, "grad_norm": 4.430581092834473, "kl": 0.994140625, "learning_rate": 7.421885807187332e-06, "loss": 0.0841, "reward": 2.076451003551483, "reward_std": 0.17796926200389862, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9893973767757416, "step": 2091 }, { "clip_ratio": 0.0, "completion_length": 585.5379791259766, "epoch": 0.6248973190949145, "grad_norm": 0.16984061896800995, "kl": 0.51171875, "learning_rate": 7.411809548974792e-06, "loss": 0.0133, "reward": 2.1356027722358704, "reward_std": 0.22393294051289558, "rewards/accuracy_reward": 0.16517857927829027, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9882812947034836, "step": 2092 }, { "clip_ratio": 0.0, "completion_length": 534.6852951049805, "epoch": 0.6251960271824359, "grad_norm": 0.41107138991355896, "kl": 0.52978515625, "learning_rate": 7.4017361064597925e-06, "loss": -0.0041, "reward": 2.1328126192092896, "reward_std": 0.16450510546565056, "rewards/accuracy_reward": 0.14508929336443543, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 2093 }, { "clip_ratio": 0.0, "completion_length": 599.6964721679688, "epoch": 0.6254947352699575, "grad_norm": 0.8912278413772583, "kl": 0.534423828125, "learning_rate": 7.391665490601252e-06, "loss": 0.0055, "reward": 2.100446581840515, "reward_std": 0.16109598148614168, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.988839328289032, "step": 2094 }, { "clip_ratio": 0.0, "completion_length": 559.3973541259766, "epoch": 0.6257934433574789, "grad_norm": 0.46881103515625, "kl": 0.66943359375, "learning_rate": 7.381597712355011e-06, "loss": 0.0022, "reward": 2.0742188692092896, "reward_std": 0.17587561532855034, "rewards/accuracy_reward": 0.09598214598372579, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.991629496216774, "step": 2095 }, { "clip_ratio": 0.0, "completion_length": 621.4218902587891, "epoch": 0.6260921514450004, "grad_norm": 1.4525715112686157, "kl": 0.1475830078125, "learning_rate": 7.371532782673832e-06, "loss": 0.0254, "reward": 2.095982253551483, "reward_std": 0.1098175747320056, "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9977678805589676, "step": 2096 }, { "clip_ratio": 0.0, "completion_length": 587.9754638671875, "epoch": 0.6263908595325218, "grad_norm": 0.13639356195926666, "kl": 0.1507568359375, "learning_rate": 7.3614707125073645e-06, "loss": -0.0025, "reward": 2.1808037757873535, "reward_std": 0.11185095086693764, "rewards/accuracy_reward": 0.19196429662406445, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.995535746216774, "step": 2097 }, { "clip_ratio": 0.0, "completion_length": 621.7411041259766, "epoch": 0.6266895676200434, "grad_norm": 0.16830921173095703, "kl": 0.324462890625, "learning_rate": 7.351411512802158e-06, "loss": 0.0066, "reward": 2.0435269474983215, "reward_std": 0.11521979607641697, "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9921875149011612, "step": 2098 }, { "clip_ratio": 0.0, "completion_length": 565.4018096923828, "epoch": 0.6269882757075648, "grad_norm": 0.8739376664161682, "kl": 0.62548828125, "learning_rate": 7.341355194501638e-06, "loss": 0.0352, "reward": 2.0809153020381927, "reward_std": 0.22089964151382446, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9893973618745804, "step": 2099 }, { "clip_ratio": 0.0, "completion_length": 626.6763610839844, "epoch": 0.6272869837950863, "grad_norm": 7.144871234893799, "kl": 1.3468017578125, "learning_rate": 7.331301768546091e-06, "loss": 0.0593, "reward": 2.1199777722358704, "reward_std": 0.18147066235542297, "rewards/accuracy_reward": 0.14285715040750802, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.990513414144516, "step": 2100 }, { "clip_ratio": 0.0, "completion_length": 631.4085083007812, "epoch": 0.6275856918826077, "grad_norm": 0.6548885107040405, "kl": 0.3189697265625, "learning_rate": 7.3212512458726605e-06, "loss": 0.0222, "reward": 2.0904018878936768, "reward_std": 0.1650756597518921, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9921875298023224, "step": 2101 }, { "clip_ratio": 0.0, "completion_length": 613.9620819091797, "epoch": 0.6278843999701292, "grad_norm": 0.37769514322280884, "kl": 0.3277587890625, "learning_rate": 7.311203637415325e-06, "loss": 0.0199, "reward": 2.106584906578064, "reward_std": 0.19340850412845612, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.992745578289032, "step": 2102 }, { "clip_ratio": 0.0, "completion_length": 599.4955596923828, "epoch": 0.6281831080576507, "grad_norm": 0.8223286271095276, "kl": 0.3148193359375, "learning_rate": 7.301158954104905e-06, "loss": 0.0522, "reward": 2.0814733505249023, "reward_std": 0.17235571146011353, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9899553954601288, "step": 2103 }, { "clip_ratio": 0.0, "completion_length": 606.3772583007812, "epoch": 0.6284818161451722, "grad_norm": 0.25681447982788086, "kl": 0.6068115234375, "learning_rate": 7.291117206869027e-06, "loss": 0.004, "reward": 2.075334906578064, "reward_std": 0.15612981468439102, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 2104 }, { "clip_ratio": 0.0, "completion_length": 609.4665374755859, "epoch": 0.6287805242326936, "grad_norm": 2.1117136478424072, "kl": 0.151123046875, "learning_rate": 7.281078406632127e-06, "loss": 0.0223, "reward": 2.1171876788139343, "reward_std": 0.110162902623415, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 2105 }, { "clip_ratio": 0.0, "completion_length": 629.9129791259766, "epoch": 0.6290792323202151, "grad_norm": 1.365716814994812, "kl": 1.049560546875, "learning_rate": 7.27104256431544e-06, "loss": 0.0217, "reward": 2.0669643878936768, "reward_std": 0.25743881426751614, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9776786118745804, "step": 2106 }, { "clip_ratio": 0.0, "completion_length": 582.9330749511719, "epoch": 0.6293779404077365, "grad_norm": 0.28547731041908264, "kl": 0.2484130859375, "learning_rate": 7.261009690836977e-06, "loss": 0.0041, "reward": 2.2187501192092896, "reward_std": 0.23632728680968285, "rewards/accuracy_reward": 0.2388393022119999, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9910714477300644, "step": 2107 }, { "clip_ratio": 0.0, "completion_length": 618.6875305175781, "epoch": 0.6296766484952581, "grad_norm": 0.48773008584976196, "kl": 0.5135498046875, "learning_rate": 7.2509797971115195e-06, "loss": 0.0186, "reward": 2.1166296005249023, "reward_std": 0.20018705446273088, "rewards/accuracy_reward": 0.14732143771834671, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9871651977300644, "step": 2108 }, { "clip_ratio": 0.0, "completion_length": 564.3236846923828, "epoch": 0.6299753565827795, "grad_norm": 0.3707898259162903, "kl": 0.92138671875, "learning_rate": 7.240952894050608e-06, "loss": 0.0032, "reward": 2.0982143878936768, "reward_std": 0.18340428173542023, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9888393133878708, "step": 2109 }, { "clip_ratio": 0.0, "completion_length": 630.9665374755859, "epoch": 0.6302740646703009, "grad_norm": 0.34457623958587646, "kl": 0.76513671875, "learning_rate": 7.230928992562534e-06, "loss": 0.0361, "reward": 1.9765625894069672, "reward_std": 0.1742430105805397, "rewards/accuracy_reward": 0.013392857974395156, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9832589626312256, "step": 2110 }, { "clip_ratio": 0.0, "completion_length": 622.6607437133789, "epoch": 0.6305727727578224, "grad_norm": 0.2920095920562744, "kl": 0.370849609375, "learning_rate": 7.220908103552319e-06, "loss": 0.0197, "reward": 2.0898438692092896, "reward_std": 0.18456257600337267, "rewards/accuracy_reward": 0.10937500791624188, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9916294813156128, "step": 2111 }, { "clip_ratio": 0.0, "completion_length": 642.5736694335938, "epoch": 0.6308714808453438, "grad_norm": 0.2229899913072586, "kl": 0.5279541015625, "learning_rate": 7.210890237921704e-06, "loss": 0.0216, "reward": 2.0357143878936768, "reward_std": 0.14474697411060333, "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9888393133878708, "step": 2112 }, { "clip_ratio": 0.0, "completion_length": 621.1428833007812, "epoch": 0.6311701889328654, "grad_norm": 0.23249369859695435, "kl": 0.2589111328125, "learning_rate": 7.20087540656915e-06, "loss": 0.0352, "reward": 2.1049108505249023, "reward_std": 0.12525091134011745, "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.995535746216774, "step": 2113 }, { "clip_ratio": 0.0, "completion_length": 633.9643249511719, "epoch": 0.6314688970203868, "grad_norm": 0.3061787188053131, "kl": 0.487548828125, "learning_rate": 7.1908636203898094e-06, "loss": 0.0338, "reward": 2.1568081378936768, "reward_std": 0.20527540147304535, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9938616305589676, "step": 2114 }, { "clip_ratio": 0.0, "completion_length": 648.3504638671875, "epoch": 0.6317676051079083, "grad_norm": 0.20692291855812073, "kl": 0.3453369140625, "learning_rate": 7.180854890275527e-06, "loss": 0.007, "reward": 2.1060269474983215, "reward_std": 0.12663239054381847, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 2115 }, { "clip_ratio": 0.0, "completion_length": 628.4419860839844, "epoch": 0.6320663131954297, "grad_norm": 0.23081673681735992, "kl": 0.2320556640625, "learning_rate": 7.1708492271148144e-06, "loss": 0.025, "reward": 2.1478796005249023, "reward_std": 0.15630930289626122, "rewards/accuracy_reward": 0.16517858393490314, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616305589676, "step": 2116 }, { "clip_ratio": 0.0, "completion_length": 685.4330596923828, "epoch": 0.6323650212829512, "grad_norm": 0.3967278301715851, "kl": 0.2755126953125, "learning_rate": 7.160846641792858e-06, "loss": 0.051, "reward": 2.076451003551483, "reward_std": 0.17937280982732773, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973469734192, "step": 2117 }, { "clip_ratio": 0.0, "completion_length": 675.0848388671875, "epoch": 0.6326637293704727, "grad_norm": 0.3207552433013916, "kl": 0.3758544921875, "learning_rate": 7.150847145191489e-06, "loss": 0.0304, "reward": 2.015066981315613, "reward_std": 0.16386719048023224, "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9905134290456772, "step": 2118 }, { "clip_ratio": 0.0, "completion_length": 646.2299346923828, "epoch": 0.6329624374579942, "grad_norm": 0.18599838018417358, "kl": 0.3023681640625, "learning_rate": 7.140850748189177e-06, "loss": -0.0095, "reward": 2.080915331840515, "reward_std": 0.17849878035485744, "rewards/accuracy_reward": 0.09151786286383867, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937798023224, "step": 2119 }, { "clip_ratio": 0.0, "completion_length": 631.0067291259766, "epoch": 0.6332611455455156, "grad_norm": 0.4988456964492798, "kl": 0.4296875, "learning_rate": 7.130857461661027e-06, "loss": 0.0293, "reward": 2.028459906578064, "reward_std": 0.17840108647942543, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9860491305589676, "step": 2120 }, { "clip_ratio": 0.0, "completion_length": 629.4084930419922, "epoch": 0.6335598536330371, "grad_norm": 0.2238464057445526, "kl": 0.216552734375, "learning_rate": 7.1208672964787505e-06, "loss": 0.0355, "reward": 2.113839328289032, "reward_std": 0.1798970978707075, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 2121 }, { "clip_ratio": 0.0, "completion_length": 553.2768096923828, "epoch": 0.6338585617205585, "grad_norm": 1.1781564950942993, "kl": 0.306884765625, "learning_rate": 7.110880263510672e-06, "loss": 0.0571, "reward": 2.0937500596046448, "reward_std": 0.11394960433244705, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 2122 }, { "clip_ratio": 0.0, "completion_length": 578.1786041259766, "epoch": 0.6341572698080801, "grad_norm": 0.4174361526966095, "kl": 0.394775390625, "learning_rate": 7.1008963736217e-06, "loss": 0.0398, "reward": 2.117187589406967, "reward_std": 0.15211549028754234, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9921875447034836, "step": 2123 }, { "clip_ratio": 0.0, "completion_length": 575.2299346923828, "epoch": 0.6344559778956015, "grad_norm": 0.20393969118595123, "kl": 0.1556396484375, "learning_rate": 7.090915637673333e-06, "loss": 0.0319, "reward": 2.1132813096046448, "reward_std": 0.09478803910315037, "rewards/accuracy_reward": 0.12723215040750802, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949777126312256, "step": 2124 }, { "clip_ratio": 0.0, "completion_length": 581.2455596923828, "epoch": 0.634754685983123, "grad_norm": 0.7953033447265625, "kl": 0.2110595703125, "learning_rate": 7.080938066523631e-06, "loss": 0.0293, "reward": 2.1099331378936768, "reward_std": 0.1274759480729699, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9960937649011612, "step": 2125 }, { "clip_ratio": 0.0, "completion_length": 593.419677734375, "epoch": 0.6350533940706444, "grad_norm": 0.6868840456008911, "kl": 0.694091796875, "learning_rate": 7.0709636710272115e-06, "loss": 0.0522, "reward": 2.044642895460129, "reward_std": 0.20243466272950172, "rewards/accuracy_reward": 0.08035714388825, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9843750447034836, "step": 2126 }, { "clip_ratio": 0.0, "completion_length": 623.1719055175781, "epoch": 0.635352102158166, "grad_norm": 0.184820294380188, "kl": 0.124755859375, "learning_rate": 7.060992462035243e-06, "loss": 0.022, "reward": 2.051339417695999, "reward_std": 0.11658122297376394, "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.993303582072258, "step": 2127 }, { "clip_ratio": 0.0, "completion_length": 636.0424346923828, "epoch": 0.6356508102456874, "grad_norm": 1.2144627571105957, "kl": 0.3372802734375, "learning_rate": 7.051024450395424e-06, "loss": 0.0356, "reward": 2.122209906578064, "reward_std": 0.16705424711108208, "rewards/accuracy_reward": 0.14508929196745157, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9905134290456772, "step": 2128 }, { "clip_ratio": 0.0, "completion_length": 534.2656555175781, "epoch": 0.6359495183332089, "grad_norm": 0.8679077625274658, "kl": 0.3677978515625, "learning_rate": 7.041059646951971e-06, "loss": 0.0334, "reward": 2.142857253551483, "reward_std": 0.19708395563066006, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.993303582072258, "step": 2129 }, { "clip_ratio": 0.0, "completion_length": 590.0803833007812, "epoch": 0.6362482264207303, "grad_norm": 0.5698196291923523, "kl": 0.6485595703125, "learning_rate": 7.031098062545614e-06, "loss": 0.0331, "reward": 2.0658483505249023, "reward_std": 0.23320633731782436, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9854910969734192, "step": 2130 }, { "clip_ratio": 0.0, "completion_length": 549.5312728881836, "epoch": 0.6365469345082518, "grad_norm": 0.2589964270591736, "kl": 0.1226806640625, "learning_rate": 7.021139708013582e-06, "loss": 0.0038, "reward": 2.114955484867096, "reward_std": 0.14121572114527225, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9988839328289032, "step": 2131 }, { "clip_ratio": 0.0, "completion_length": 591.9777221679688, "epoch": 0.6368456425957733, "grad_norm": 0.2907955050468445, "kl": 0.2669677734375, "learning_rate": 7.0111845941895885e-06, "loss": 0.0481, "reward": 2.150669753551483, "reward_std": 0.20346449688076973, "rewards/accuracy_reward": 0.17633929033763707, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553954601288, "step": 2132 }, { "clip_ratio": 0.0, "completion_length": 506.3080520629883, "epoch": 0.6371443506832948, "grad_norm": 1.035014271736145, "kl": 0.48876953125, "learning_rate": 7.001232731903818e-06, "loss": 0.0519, "reward": 2.1512277722358704, "reward_std": 0.18841539323329926, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 2133 }, { "clip_ratio": 0.0, "completion_length": 532.4397430419922, "epoch": 0.6374430587708162, "grad_norm": 0.23305360972881317, "kl": 0.4969482421875, "learning_rate": 6.991284131982927e-06, "loss": 0.0026, "reward": 2.1143974661827087, "reward_std": 0.1522299274802208, "rewards/accuracy_reward": 0.13616072479635477, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9916295111179352, "step": 2134 }, { "clip_ratio": 0.0, "completion_length": 544.8638610839844, "epoch": 0.6377417668583377, "grad_norm": 1.0466206073760986, "kl": 0.36279296875, "learning_rate": 6.981338805250015e-06, "loss": 0.0196, "reward": 2.1093751192092896, "reward_std": 0.15095992852002382, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 2135 }, { "clip_ratio": 0.0, "completion_length": 520.5044937133789, "epoch": 0.6380404749458591, "grad_norm": 0.6010384559631348, "kl": 0.411865234375, "learning_rate": 6.971396762524622e-06, "loss": 0.0512, "reward": 2.1099331378936768, "reward_std": 0.14766835048794746, "rewards/accuracy_reward": 0.12946429592557251, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.991629496216774, "step": 2136 }, { "clip_ratio": 0.0, "completion_length": 513.4397583007812, "epoch": 0.6383391830333807, "grad_norm": 0.19835060834884644, "kl": 0.4046630859375, "learning_rate": 6.9614580146227155e-06, "loss": 0.0127, "reward": 2.100446581840515, "reward_std": 0.11399613041430712, "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9933035969734192, "step": 2137 }, { "clip_ratio": 0.0, "completion_length": 500.76788330078125, "epoch": 0.6386378911209021, "grad_norm": 0.6307342648506165, "kl": 0.3314208984375, "learning_rate": 6.951522572356682e-06, "loss": 0.0429, "reward": 2.1919643878936768, "reward_std": 0.2338980734348297, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933036118745804, "step": 2138 }, { "clip_ratio": 0.0, "completion_length": 521.9799423217773, "epoch": 0.6389365992084236, "grad_norm": 0.1790759414434433, "kl": 0.1092529296875, "learning_rate": 6.9415904465353045e-06, "loss": 0.0191, "reward": 2.0736608505249023, "reward_std": 0.10739377047866583, "rewards/accuracy_reward": 0.08258928707800806, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9955357313156128, "step": 2139 }, { "clip_ratio": 0.0, "completion_length": 525.0446701049805, "epoch": 0.639235307295945, "grad_norm": 0.1439022719860077, "kl": 0.100341796875, "learning_rate": 6.931661647963766e-06, "loss": 0.0191, "reward": 2.1049107909202576, "reward_std": 0.15649579465389252, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9955357313156128, "step": 2140 }, { "clip_ratio": 0.0, "completion_length": 540.6317291259766, "epoch": 0.6395340153834665, "grad_norm": 1.6086890697479248, "kl": 0.70703125, "learning_rate": 6.921736187443624e-06, "loss": 0.0998, "reward": 2.073102831840515, "reward_std": 0.12559817917644978, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9882812947034836, "step": 2141 }, { "clip_ratio": 0.0, "completion_length": 523.1049270629883, "epoch": 0.639832723470988, "grad_norm": 1.7177960872650146, "kl": 0.5697021484375, "learning_rate": 6.911814075772809e-06, "loss": 0.0449, "reward": 2.201451003551483, "reward_std": 0.16764596290886402, "rewards/accuracy_reward": 0.22321429569274187, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.991629496216774, "step": 2142 }, { "clip_ratio": 0.0, "completion_length": 513.9375152587891, "epoch": 0.6401314315585095, "grad_norm": 0.2538197636604309, "kl": 0.4779052734375, "learning_rate": 6.901895323745604e-06, "loss": 0.0187, "reward": 2.034040242433548, "reward_std": 0.18390630185604095, "rewards/accuracy_reward": 0.05803571455180645, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973469734192, "step": 2143 }, { "clip_ratio": 0.0, "completion_length": 510.0290298461914, "epoch": 0.6404301396460309, "grad_norm": 0.32521852850914, "kl": 0.2060546875, "learning_rate": 6.89197994215264e-06, "loss": 0.0292, "reward": 2.080915331840515, "reward_std": 0.12054564151912928, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616454601288, "step": 2144 }, { "clip_ratio": 0.0, "completion_length": 534.3393020629883, "epoch": 0.6407288477335524, "grad_norm": 0.174055278301239, "kl": 0.1092529296875, "learning_rate": 6.882067941780881e-06, "loss": 0.0237, "reward": 2.0418527722358704, "reward_std": 0.10691366158425808, "rewards/accuracy_reward": 0.049107144586741924, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 2145 }, { "clip_ratio": 0.0, "completion_length": 534.5089569091797, "epoch": 0.6410275558210738, "grad_norm": 0.0893525555729866, "kl": 0.3106689453125, "learning_rate": 6.87215933341361e-06, "loss": -0.0015, "reward": 2.0446430444717407, "reward_std": 0.07439951971173286, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9955357313156128, "step": 2146 }, { "clip_ratio": 0.0, "completion_length": 541.6071624755859, "epoch": 0.6413262639085954, "grad_norm": 0.6081139445304871, "kl": 0.4718017578125, "learning_rate": 6.862254127830426e-06, "loss": 0.029, "reward": 2.146763503551483, "reward_std": 0.2121233344078064, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9882812798023224, "step": 2147 }, { "clip_ratio": 0.0, "completion_length": 542.7210083007812, "epoch": 0.6416249719961168, "grad_norm": 1.0406360626220703, "kl": 0.254150390625, "learning_rate": 6.852352335807213e-06, "loss": 0.013, "reward": 2.0424108505249023, "reward_std": 0.17618322372436523, "rewards/accuracy_reward": 0.0714285762514919, "rewards/format_reward": 0.9821428656578064, "rewards/tag_count_reward": 0.9888392984867096, "step": 2148 }, { "clip_ratio": 0.0, "completion_length": 530.9442291259766, "epoch": 0.6419236800836383, "grad_norm": 0.7494205236434937, "kl": 0.3861083984375, "learning_rate": 6.84245396811616e-06, "loss": 0.031, "reward": 2.1523438692092896, "reward_std": 0.25490347668528557, "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9849330633878708, "step": 2149 }, { "clip_ratio": 0.0, "completion_length": 589.0379791259766, "epoch": 0.6422223881711597, "grad_norm": 1.3269902467727661, "kl": 0.263671875, "learning_rate": 6.832559035525716e-06, "loss": 0.0163, "reward": 2.068638503551483, "reward_std": 0.1366334743797779, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9905134439468384, "step": 2150 }, { "clip_ratio": 0.0, "completion_length": 569.3214569091797, "epoch": 0.6425210962586813, "grad_norm": 0.387560099363327, "kl": 0.2791748046875, "learning_rate": 6.822667548800599e-06, "loss": -0.0015, "reward": 2.0111607909202576, "reward_std": 0.08659445866942406, "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.995535746216774, "step": 2151 }, { "clip_ratio": 0.0, "completion_length": 598.0870819091797, "epoch": 0.6428198043462027, "grad_norm": 0.4656822979450226, "kl": 0.2493896484375, "learning_rate": 6.812779518701778e-06, "loss": 0.0183, "reward": 2.0502232909202576, "reward_std": 0.15769579727202654, "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553805589676, "step": 2152 }, { "clip_ratio": 0.0, "completion_length": 577.1406402587891, "epoch": 0.6431185124337241, "grad_norm": 11.143975257873535, "kl": 2.364990234375, "learning_rate": 6.802894955986459e-06, "loss": 0.0888, "reward": 2.058035731315613, "reward_std": 0.1404235064983368, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714775323868, "step": 2153 }, { "clip_ratio": 0.0, "completion_length": 629.9486846923828, "epoch": 0.6434172205212456, "grad_norm": 0.8838651776313782, "kl": 0.58203125, "learning_rate": 6.793013871408076e-06, "loss": 0.0528, "reward": 2.0574778020381927, "reward_std": 0.22881752625107765, "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.98604916036129, "step": 2154 }, { "clip_ratio": 0.0, "completion_length": 615.9754791259766, "epoch": 0.643715928608767, "grad_norm": 1.0721226930618286, "kl": 0.607177734375, "learning_rate": 6.783136275716283e-06, "loss": 0.0411, "reward": 2.13616082072258, "reward_std": 0.28566865250468254, "rewards/accuracy_reward": 0.16964286682195961, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.986607164144516, "step": 2155 }, { "clip_ratio": 0.0, "completion_length": 612.4843902587891, "epoch": 0.6440146366962886, "grad_norm": 1.3981670141220093, "kl": 1.72119140625, "learning_rate": 6.773262179656936e-06, "loss": 0.033, "reward": 2.080357313156128, "reward_std": 0.2799568325281143, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9776785969734192, "step": 2156 }, { "clip_ratio": 0.0, "completion_length": 580.2500305175781, "epoch": 0.64431334478381, "grad_norm": 1.0251082181930542, "kl": 1.11328125, "learning_rate": 6.763391593972084e-06, "loss": 0.0476, "reward": 2.0357143580913544, "reward_std": 0.1416286788880825, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.988839328289032, "step": 2157 }, { "clip_ratio": 0.0, "completion_length": 579.8236846923828, "epoch": 0.6446120528713315, "grad_norm": 0.30718308687210083, "kl": 0.2542724609375, "learning_rate": 6.7535245293999556e-06, "loss": 0.0073, "reward": 2.060267984867096, "reward_std": 0.08123911079019308, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 2158 }, { "clip_ratio": 0.0, "completion_length": 577.8638610839844, "epoch": 0.6449107609588529, "grad_norm": 0.13968507945537567, "kl": 0.26806640625, "learning_rate": 6.74366099667495e-06, "loss": -0.0013, "reward": 2.2025671005249023, "reward_std": 0.12901266105473042, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949776977300644, "step": 2159 }, { "clip_ratio": 0.0, "completion_length": 630.8638610839844, "epoch": 0.6452094690463744, "grad_norm": 0.4107196629047394, "kl": 0.266357421875, "learning_rate": 6.733801006527625e-06, "loss": 0.026, "reward": 2.0691965222358704, "reward_std": 0.22533336281776428, "rewards/accuracy_reward": 0.10044643608853221, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.988839328289032, "step": 2160 }, { "clip_ratio": 0.0, "completion_length": 626.9866333007812, "epoch": 0.6455081771338959, "grad_norm": 0.11747819930315018, "kl": 0.1292724609375, "learning_rate": 6.723944569684684e-06, "loss": 0.0167, "reward": 2.0976563096046448, "reward_std": 0.08489983528852463, "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.997209832072258, "step": 2161 }, { "clip_ratio": 0.0, "completion_length": 632.7366333007812, "epoch": 0.6458068852214174, "grad_norm": 0.3323573172092438, "kl": 0.2711181640625, "learning_rate": 6.71409169686896e-06, "loss": 0.0127, "reward": 2.072544753551483, "reward_std": 0.11198883131146431, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875298023224, "step": 2162 }, { "clip_ratio": 0.0, "completion_length": 590.8660888671875, "epoch": 0.6461055933089388, "grad_norm": 4.154722690582275, "kl": 0.361328125, "learning_rate": 6.704242398799419e-06, "loss": 0.0409, "reward": 2.0652902722358704, "reward_std": 0.15370220877230167, "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9916294813156128, "step": 2163 }, { "clip_ratio": 0.0, "completion_length": 620.9219055175781, "epoch": 0.6464043013964603, "grad_norm": 0.7772718071937561, "kl": 0.4776611328125, "learning_rate": 6.6943966861911295e-06, "loss": 0.0587, "reward": 2.070870578289032, "reward_std": 0.2120601385831833, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.98604916036129, "step": 2164 }, { "clip_ratio": 0.0, "completion_length": 625.1920013427734, "epoch": 0.6467030094839817, "grad_norm": 0.9681296944618225, "kl": 0.4847412109375, "learning_rate": 6.684554569755258e-06, "loss": 0.0576, "reward": 2.0675224363803864, "reward_std": 0.2538611926138401, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9827009439468384, "step": 2165 }, { "clip_ratio": 0.0, "completion_length": 607.1027069091797, "epoch": 0.6470017175715033, "grad_norm": 0.1485864371061325, "kl": 0.2369384765625, "learning_rate": 6.674716060199069e-06, "loss": -0.0016, "reward": 2.0920759439468384, "reward_std": 0.13318928703665733, "rewards/accuracy_reward": 0.10044643003493547, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 2166 }, { "clip_ratio": 0.0, "completion_length": 567.4040374755859, "epoch": 0.6473004256590247, "grad_norm": 0.2465779334306717, "kl": 0.2322998046875, "learning_rate": 6.664881168225894e-06, "loss": -0.0083, "reward": 2.190290331840515, "reward_std": 0.13691939041018486, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.991629496216774, "step": 2167 }, { "clip_ratio": 0.0, "completion_length": 580.1964416503906, "epoch": 0.6475991337465462, "grad_norm": 0.22065401077270508, "kl": 0.4620361328125, "learning_rate": 6.655049904535131e-06, "loss": 0.017, "reward": 2.1049107909202576, "reward_std": 0.1771821342408657, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9888393133878708, "step": 2168 }, { "clip_ratio": 0.0, "completion_length": 548.803596496582, "epoch": 0.6478978418340676, "grad_norm": 1.402557134628296, "kl": 0.3038330078125, "learning_rate": 6.645222279822229e-06, "loss": 0.0357, "reward": 2.092076003551483, "reward_std": 0.18553309608250856, "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.991629496216774, "step": 2169 }, { "clip_ratio": 0.0, "completion_length": 613.6183166503906, "epoch": 0.6481965499215891, "grad_norm": 0.3615218698978424, "kl": 0.497314453125, "learning_rate": 6.635398304778685e-06, "loss": 0.041, "reward": 2.0563617050647736, "reward_std": 0.13919130433350801, "rewards/accuracy_reward": 0.07589286309666932, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9916294813156128, "step": 2170 }, { "clip_ratio": 0.0, "completion_length": 575.3058319091797, "epoch": 0.6484952580091106, "grad_norm": 0.8345733284950256, "kl": 0.3875732421875, "learning_rate": 6.625577990092019e-06, "loss": 0.0484, "reward": 2.111607253551483, "reward_std": 0.1715732328593731, "rewards/accuracy_reward": 0.13616071757860482, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9910714775323868, "step": 2171 }, { "clip_ratio": 0.0, "completion_length": 579.7545013427734, "epoch": 0.6487939660966321, "grad_norm": 1.3634767532348633, "kl": 0.9425048828125, "learning_rate": 6.615761346445769e-06, "loss": 0.0323, "reward": 2.079799234867096, "reward_std": 0.16247124690562487, "rewards/accuracy_reward": 0.10491071688011289, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.990513414144516, "step": 2172 }, { "clip_ratio": 0.0, "completion_length": 552.0357360839844, "epoch": 0.6490926741841535, "grad_norm": 0.5671789646148682, "kl": 0.5281982421875, "learning_rate": 6.605948384519485e-06, "loss": 0.0607, "reward": 2.068638503551483, "reward_std": 0.16540857404470444, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 2173 }, { "clip_ratio": 0.0, "completion_length": 591.8705596923828, "epoch": 0.649391382271675, "grad_norm": 0.4970574676990509, "kl": 0.463623046875, "learning_rate": 6.5961391149887065e-06, "loss": 0.0481, "reward": 2.0385045409202576, "reward_std": 0.1592297051101923, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.991629496216774, "step": 2174 }, { "clip_ratio": 0.0, "completion_length": 584.803581237793, "epoch": 0.6496900903591964, "grad_norm": 0.216690331697464, "kl": 0.1497802734375, "learning_rate": 6.586333548524957e-06, "loss": 0.021, "reward": 2.107701003551483, "reward_std": 0.1564756203442812, "rewards/accuracy_reward": 0.11607143562287092, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 2175 }, { "clip_ratio": 0.0, "completion_length": 636.6473541259766, "epoch": 0.649988798446718, "grad_norm": 0.4047645330429077, "kl": 0.2655029296875, "learning_rate": 6.576531695795727e-06, "loss": 0.049, "reward": 2.032366156578064, "reward_std": 0.1301727592945099, "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 2176 }, { "clip_ratio": 0.0, "completion_length": 607.0736846923828, "epoch": 0.6502875065342394, "grad_norm": 0.2600668668746948, "kl": 0.1158447265625, "learning_rate": 6.56673356746448e-06, "loss": 0.0142, "reward": 2.0161831378936768, "reward_std": 0.15483182296156883, "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616454601288, "step": 2177 }, { "clip_ratio": 0.0, "completion_length": 614.8281555175781, "epoch": 0.6505862146217609, "grad_norm": 2.6063215732574463, "kl": 0.915283203125, "learning_rate": 6.556939174190615e-06, "loss": 0.095, "reward": 2.0697545409202576, "reward_std": 0.18105081841349602, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973618745804, "step": 2178 }, { "clip_ratio": 0.0, "completion_length": 609.1696624755859, "epoch": 0.6508849227092823, "grad_norm": 1.2372124195098877, "kl": 0.476806640625, "learning_rate": 6.54714852662947e-06, "loss": 0.0724, "reward": 2.1250001192092896, "reward_std": 0.2593097351491451, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9866071790456772, "step": 2179 }, { "clip_ratio": 0.0, "completion_length": 572.3661041259766, "epoch": 0.6511836307968039, "grad_norm": 0.41426926851272583, "kl": 0.33642578125, "learning_rate": 6.537361635432316e-06, "loss": 0.0378, "reward": 2.1612724661827087, "reward_std": 0.1911649378016591, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.991629496216774, "step": 2180 }, { "clip_ratio": 0.0, "completion_length": 647.9129791259766, "epoch": 0.6514823388843253, "grad_norm": 3.036902666091919, "kl": 0.673583984375, "learning_rate": 6.527578511246325e-06, "loss": 0.0831, "reward": 2.04631707072258, "reward_std": 0.13943966291844845, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134439468384, "step": 2181 }, { "clip_ratio": 0.0, "completion_length": 571.9844055175781, "epoch": 0.6517810469718468, "grad_norm": 0.49794235825538635, "kl": 0.482421875, "learning_rate": 6.517799164714581e-06, "loss": 0.0469, "reward": 2.1512277722358704, "reward_std": 0.21032719500362873, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812947034836, "step": 2182 }, { "clip_ratio": 0.0, "completion_length": 575.560302734375, "epoch": 0.6520797550593682, "grad_norm": 0.4030884802341461, "kl": 0.2144775390625, "learning_rate": 6.508023606476052e-06, "loss": 0.0214, "reward": 2.131138503551483, "reward_std": 0.11464738566428423, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.997209832072258, "step": 2183 }, { "clip_ratio": 0.0, "completion_length": 627.1272583007812, "epoch": 0.6523784631468897, "grad_norm": 0.31576332449913025, "kl": 0.184814453125, "learning_rate": 6.498251847165589e-06, "loss": 0.0224, "reward": 2.0708706378936768, "reward_std": 0.12987130600959063, "rewards/accuracy_reward": 0.08928571571595967, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.992745578289032, "step": 2184 }, { "clip_ratio": 0.0, "completion_length": 616.8437957763672, "epoch": 0.6526771712344112, "grad_norm": 0.29037410020828247, "kl": 0.088134765625, "learning_rate": 6.4884838974139096e-06, "loss": 0.0122, "reward": 2.108817011117935, "reward_std": 0.13882900029420853, "rewards/accuracy_reward": 0.12276786495931447, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9972098767757416, "step": 2185 }, { "clip_ratio": 0.0, "completion_length": 607.0201263427734, "epoch": 0.6529758793219327, "grad_norm": 0.5657331347465515, "kl": 0.3089599609375, "learning_rate": 6.478719767847581e-06, "loss": 0.0385, "reward": 2.1021206378936768, "reward_std": 0.16061905585229397, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9927455633878708, "step": 2186 }, { "clip_ratio": 0.0, "completion_length": 622.8080596923828, "epoch": 0.6532745874094541, "grad_norm": 0.6691778302192688, "kl": 0.4586181640625, "learning_rate": 6.468959469089025e-06, "loss": 0.0386, "reward": 2.1289062798023224, "reward_std": 0.20250234752893448, "rewards/accuracy_reward": 0.1629464323632419, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.98604916036129, "step": 2187 }, { "clip_ratio": 0.0, "completion_length": 563.866096496582, "epoch": 0.6535732954969756, "grad_norm": 0.587884247303009, "kl": 0.34521484375, "learning_rate": 6.4592030117564885e-06, "loss": 0.0182, "reward": 2.0770090222358704, "reward_std": 0.2097139023244381, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.987723246216774, "step": 2188 }, { "clip_ratio": 0.0, "completion_length": 561.8236999511719, "epoch": 0.653872003584497, "grad_norm": 1.3020968437194824, "kl": 0.38525390625, "learning_rate": 6.44945040646404e-06, "loss": 0.0289, "reward": 2.0156251192092896, "reward_std": 0.19053971581161022, "rewards/accuracy_reward": 0.055803572526201606, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9821428954601288, "step": 2189 }, { "clip_ratio": 0.0, "completion_length": 582.4955596923828, "epoch": 0.6541707116720186, "grad_norm": 0.43803679943084717, "kl": 0.3892822265625, "learning_rate": 6.4397016638215535e-06, "loss": 0.0419, "reward": 2.0485492646694183, "reward_std": 0.1473235646262765, "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9905134290456772, "step": 2190 }, { "clip_ratio": 0.0, "completion_length": 643.0379791259766, "epoch": 0.65446941975954, "grad_norm": 0.47717368602752686, "kl": 0.5106201171875, "learning_rate": 6.429956794434714e-06, "loss": 0.0368, "reward": 2.0117188692092896, "reward_std": 0.1729719303548336, "rewards/accuracy_reward": 0.03794642980210483, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9871652126312256, "step": 2191 }, { "clip_ratio": 0.0, "completion_length": 651.9129638671875, "epoch": 0.6547681278470615, "grad_norm": 2.229048490524292, "kl": 0.662353515625, "learning_rate": 6.420215808904979e-06, "loss": 0.0397, "reward": 2.1601563096046448, "reward_std": 0.17749475315213203, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.992745578289032, "step": 2192 }, { "clip_ratio": 0.0, "completion_length": 576.8839645385742, "epoch": 0.6550668359345829, "grad_norm": 0.43603742122650146, "kl": 0.337158203125, "learning_rate": 6.410478717829587e-06, "loss": 0.0367, "reward": 2.0770090520381927, "reward_std": 0.14967706706374884, "rewards/accuracy_reward": 0.09598214528523386, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875298023224, "step": 2193 }, { "clip_ratio": 0.0, "completion_length": 606.2678833007812, "epoch": 0.6553655440221045, "grad_norm": 1.0804721117019653, "kl": 0.267822265625, "learning_rate": 6.40074553180154e-06, "loss": 0.0206, "reward": 2.1562501192092896, "reward_std": 0.12252027355134487, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9910714626312256, "step": 2194 }, { "clip_ratio": 0.0, "completion_length": 599.5424499511719, "epoch": 0.6556642521096259, "grad_norm": 0.24620011448860168, "kl": 0.145263671875, "learning_rate": 6.39101626140959e-06, "loss": 0.0261, "reward": 2.050781339406967, "reward_std": 0.1385787632316351, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.992745578289032, "step": 2195 }, { "clip_ratio": 0.0, "completion_length": 636.9419860839844, "epoch": 0.6559629601971473, "grad_norm": 1.2653452157974243, "kl": 0.443603515625, "learning_rate": 6.381290917238229e-06, "loss": 0.0574, "reward": 2.0775670409202576, "reward_std": 0.14118062052875757, "rewards/accuracy_reward": 0.09598214598372579, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9927455484867096, "step": 2196 }, { "clip_ratio": 0.0, "completion_length": 646.2857360839844, "epoch": 0.6562616682846688, "grad_norm": 0.22541332244873047, "kl": 0.111083984375, "learning_rate": 6.371569509867676e-06, "loss": 0.0211, "reward": 2.044642895460129, "reward_std": 0.12951632775366306, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9933036118745804, "step": 2197 }, { "clip_ratio": 0.0, "completion_length": 589.3884201049805, "epoch": 0.6565603763721902, "grad_norm": 0.3142176568508148, "kl": 0.359130859375, "learning_rate": 6.361852049873875e-06, "loss": 0.0157, "reward": 2.060826003551483, "reward_std": 0.13655143417418003, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9938616454601288, "step": 2198 }, { "clip_ratio": 0.0, "completion_length": 586.3370666503906, "epoch": 0.6568590844597118, "grad_norm": 0.22970983386039734, "kl": 0.1356201171875, "learning_rate": 6.352138547828466e-06, "loss": 0.0116, "reward": 2.0792412161827087, "reward_std": 0.14216485526412725, "rewards/accuracy_reward": 0.08482143562287092, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9988839626312256, "step": 2199 }, { "clip_ratio": 0.0, "completion_length": 579.3147583007812, "epoch": 0.6571577925472332, "grad_norm": 0.5072870254516602, "kl": 0.187744140625, "learning_rate": 6.342429014298786e-06, "loss": 0.0455, "reward": 2.0094867050647736, "reward_std": 0.12395684979856014, "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.991629496216774, "step": 2200 }, { "clip_ratio": 0.0, "completion_length": 599.091552734375, "epoch": 0.6574565006347547, "grad_norm": 0.21285519003868103, "kl": 0.1251220703125, "learning_rate": 6.3327234598478605e-06, "loss": 0.008, "reward": 2.065290242433548, "reward_std": 0.09032375551760197, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616454601288, "step": 2201 }, { "clip_ratio": 0.0, "completion_length": 538.1763610839844, "epoch": 0.6577552087222761, "grad_norm": 2.2054851055145264, "kl": 0.4398193359375, "learning_rate": 6.323021895034378e-06, "loss": 0.0362, "reward": 2.0708706378936768, "reward_std": 0.13801665417850018, "rewards/accuracy_reward": 0.09151786239817739, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.990513414144516, "step": 2202 }, { "clip_ratio": 0.0, "completion_length": 583.9732284545898, "epoch": 0.6580539168097976, "grad_norm": 1.0534530878067017, "kl": 0.2913818359375, "learning_rate": 6.313324330412692e-06, "loss": 0.0321, "reward": 2.0998885333538055, "reward_std": 0.18032503500580788, "rewards/accuracy_reward": 0.12946429522708058, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9882812798023224, "step": 2203 }, { "clip_ratio": 0.0, "completion_length": 516.7076110839844, "epoch": 0.658352624897319, "grad_norm": 2.1658120155334473, "kl": 0.5418701171875, "learning_rate": 6.303630776532799e-06, "loss": 0.0714, "reward": 2.111607253551483, "reward_std": 0.2021181583404541, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9866071790456772, "step": 2204 }, { "clip_ratio": 0.0, "completion_length": 578.0602874755859, "epoch": 0.6586513329848406, "grad_norm": 0.6906934976577759, "kl": 0.2344970703125, "learning_rate": 6.29394124394034e-06, "loss": 0.0462, "reward": 2.029576003551483, "reward_std": 0.20381690002977848, "rewards/accuracy_reward": 0.05803571711294353, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652126312256, "step": 2205 }, { "clip_ratio": 0.0, "completion_length": 567.9933166503906, "epoch": 0.658950041072362, "grad_norm": 0.2847347557544708, "kl": 0.1336669921875, "learning_rate": 6.284255743176576e-06, "loss": 0.0291, "reward": 2.069754511117935, "reward_std": 0.1418198775500059, "rewards/accuracy_reward": 0.08928572130389512, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9916295111179352, "step": 2206 }, { "clip_ratio": 0.0, "completion_length": 583.6428833007812, "epoch": 0.6592487491598835, "grad_norm": 0.5866267085075378, "kl": 0.369873046875, "learning_rate": 6.274574284778379e-06, "loss": 0.0536, "reward": 2.0474331080913544, "reward_std": 0.20608412474393845, "rewards/accuracy_reward": 0.07812500605359674, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9849330633878708, "step": 2207 }, { "clip_ratio": 0.0, "completion_length": 519.194221496582, "epoch": 0.6595474572474049, "grad_norm": 0.5590463280677795, "kl": 0.2236328125, "learning_rate": 6.26489687927823e-06, "loss": 0.0376, "reward": 2.0965403020381927, "reward_std": 0.18925542384386063, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9916294813156128, "step": 2208 }, { "clip_ratio": 0.0, "completion_length": 548.8839416503906, "epoch": 0.6598461653349265, "grad_norm": 0.2726258635520935, "kl": 0.13818359375, "learning_rate": 6.2552235372041985e-06, "loss": 0.0298, "reward": 2.095424175262451, "reward_std": 0.2030961588025093, "rewards/accuracy_reward": 0.12946428824216127, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9860491454601288, "step": 2209 }, { "clip_ratio": 0.0, "completion_length": 585.1428985595703, "epoch": 0.6601448734224479, "grad_norm": 0.27040284872055054, "kl": 0.1239013671875, "learning_rate": 6.245554269079929e-06, "loss": 0.0306, "reward": 2.0970982909202576, "reward_std": 0.16538403648883104, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9899553805589676, "step": 2210 }, { "clip_ratio": 0.0, "completion_length": 549.5781402587891, "epoch": 0.6604435815099694, "grad_norm": 0.17526377737522125, "kl": 0.1180419921875, "learning_rate": 6.235889085424638e-06, "loss": 0.015, "reward": 2.1127233505249023, "reward_std": 0.1857096254825592, "rewards/accuracy_reward": 0.13616072246804833, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553954601288, "step": 2211 }, { "clip_ratio": 0.0, "completion_length": 576.9888763427734, "epoch": 0.6607422895974908, "grad_norm": 0.1958295702934265, "kl": 0.25390625, "learning_rate": 6.226227996753102e-06, "loss": 0.0157, "reward": 2.1646206378936768, "reward_std": 0.18916823342442513, "rewards/accuracy_reward": 0.1941964402794838, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9882813096046448, "step": 2212 }, { "clip_ratio": 0.0, "completion_length": 522.6428680419922, "epoch": 0.6610409976850123, "grad_norm": 0.7009002566337585, "kl": 0.13037109375, "learning_rate": 6.2165710135756365e-06, "loss": 0.0314, "reward": 2.204241156578064, "reward_std": 0.16517849639058113, "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 2213 }, { "clip_ratio": 0.0, "completion_length": 580.0736846923828, "epoch": 0.6613397057725338, "grad_norm": 0.27728280425071716, "kl": 0.1085205078125, "learning_rate": 6.206918146398091e-06, "loss": 0.0172, "reward": 2.0658482909202576, "reward_std": 0.12326836306601763, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9944196790456772, "step": 2214 }, { "clip_ratio": 0.0, "completion_length": 621.9375305175781, "epoch": 0.6616384138600553, "grad_norm": 0.28906333446502686, "kl": 0.146728515625, "learning_rate": 6.1972694057218404e-06, "loss": 0.0234, "reward": 2.163504511117935, "reward_std": 0.21684269234538078, "rewards/accuracy_reward": 0.1941964328289032, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871651977300644, "step": 2215 }, { "clip_ratio": 0.0, "completion_length": 602.7254791259766, "epoch": 0.6619371219475767, "grad_norm": 0.17997480928897858, "kl": 0.126953125, "learning_rate": 6.18762480204377e-06, "loss": 0.0442, "reward": 2.1300224363803864, "reward_std": 0.151371568441391, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9893973469734192, "step": 2216 }, { "clip_ratio": 0.0, "completion_length": 604.1741333007812, "epoch": 0.6622358300350982, "grad_norm": 0.4115852117538452, "kl": 0.11474609375, "learning_rate": 6.177984345856262e-06, "loss": 0.0388, "reward": 2.0357143878936768, "reward_std": 0.21563202515244484, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.9799107313156128, "rewards/tag_count_reward": 0.9843750298023224, "step": 2217 }, { "clip_ratio": 0.0, "completion_length": 615.3326263427734, "epoch": 0.6625345381226196, "grad_norm": 0.15786223113536835, "kl": 0.111572265625, "learning_rate": 6.168348047647185e-06, "loss": 0.0215, "reward": 2.1138394474983215, "reward_std": 0.15895190183073282, "rewards/accuracy_reward": 0.12946429406292737, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 2218 }, { "clip_ratio": 0.0, "completion_length": 567.0357360839844, "epoch": 0.6628332462101412, "grad_norm": 0.18901316821575165, "kl": 0.0927734375, "learning_rate": 6.158715917899892e-06, "loss": 0.0079, "reward": 2.103794753551483, "reward_std": 0.14121555164456367, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 2219 }, { "clip_ratio": 0.0, "completion_length": 607.4777221679688, "epoch": 0.6631319542976626, "grad_norm": 0.2783227860927582, "kl": 0.10693359375, "learning_rate": 6.149087967093195e-06, "loss": 0.0197, "reward": 2.0870537161827087, "reward_std": 0.14096087217330933, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933035969734192, "step": 2220 }, { "clip_ratio": 0.0, "completion_length": 589.0468902587891, "epoch": 0.6634306623851841, "grad_norm": 0.5282402634620667, "kl": 0.220703125, "learning_rate": 6.13946420570136e-06, "loss": 0.047, "reward": 2.1679688692092896, "reward_std": 0.17813490889966488, "rewards/accuracy_reward": 0.1986607201397419, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652126312256, "step": 2221 }, { "clip_ratio": 0.0, "completion_length": 608.8058319091797, "epoch": 0.6637293704727055, "grad_norm": 0.22183358669281006, "kl": 0.130126953125, "learning_rate": 6.1298446441940916e-06, "loss": 0.0461, "reward": 2.0306921005249023, "reward_std": 0.19355440139770508, "rewards/accuracy_reward": 0.0691964291036129, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9838170111179352, "step": 2222 }, { "clip_ratio": 0.0, "completion_length": 597.1227874755859, "epoch": 0.664028078560227, "grad_norm": 0.33534613251686096, "kl": 0.134765625, "learning_rate": 6.120229293036539e-06, "loss": 0.0377, "reward": 2.105468839406967, "reward_std": 0.19853601977229118, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9827009290456772, "step": 2223 }, { "clip_ratio": 0.0, "completion_length": 574.0513610839844, "epoch": 0.6643267866477485, "grad_norm": 0.17759975790977478, "kl": 0.130126953125, "learning_rate": 6.110618162689257e-06, "loss": 0.0566, "reward": 2.1316965222358704, "reward_std": 0.18769019655883312, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.986607164144516, "step": 2224 }, { "clip_ratio": 0.0, "completion_length": 590.7299194335938, "epoch": 0.66462549473527, "grad_norm": 0.17179086804389954, "kl": 0.1292724609375, "learning_rate": 6.10101126360821e-06, "loss": 0.048, "reward": 2.0926340222358704, "reward_std": 0.19259820878505707, "rewards/accuracy_reward": 0.12500000838190317, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9854911118745804, "step": 2225 }, { "clip_ratio": 0.0, "completion_length": 622.6049346923828, "epoch": 0.6649242028227914, "grad_norm": 0.17191408574581146, "kl": 0.1492919921875, "learning_rate": 6.091408606244769e-06, "loss": 0.0321, "reward": 2.033482253551483, "reward_std": 0.20596596226096153, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.988839328289032, "step": 2226 }, { "clip_ratio": 0.0, "completion_length": 535.5915374755859, "epoch": 0.6652229109103129, "grad_norm": 0.22334788739681244, "kl": 0.1263427734375, "learning_rate": 6.081810201045681e-06, "loss": 0.0283, "reward": 2.1205357909202576, "reward_std": 0.1437385343015194, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9910714477300644, "step": 2227 }, { "clip_ratio": 0.0, "completion_length": 580.9308166503906, "epoch": 0.6655216189978344, "grad_norm": 0.30633771419525146, "kl": 0.15869140625, "learning_rate": 6.072216058453071e-06, "loss": 0.0656, "reward": 2.1049107909202576, "reward_std": 0.2610424682497978, "rewards/accuracy_reward": 0.14508929336443543, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9821428954601288, "step": 2228 }, { "clip_ratio": 0.0, "completion_length": 611.2053833007812, "epoch": 0.6658203270853559, "grad_norm": 0.20449529588222504, "kl": 0.1033935546875, "learning_rate": 6.0626261889044236e-06, "loss": 0.0213, "reward": 2.0976563096046448, "reward_std": 0.145141227170825, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 2229 }, { "clip_ratio": 0.0, "completion_length": 576.2187957763672, "epoch": 0.6661190351728773, "grad_norm": 0.36199986934661865, "kl": 0.1195068359375, "learning_rate": 6.053040602832581e-06, "loss": 0.0516, "reward": 2.021763503551483, "reward_std": 0.18181529268622398, "rewards/accuracy_reward": 0.05133928754366934, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9860491454601288, "step": 2230 }, { "clip_ratio": 0.0, "completion_length": 538.0312805175781, "epoch": 0.6664177432603988, "grad_norm": 0.23617030680179596, "kl": 0.105224609375, "learning_rate": 6.043459310665716e-06, "loss": 0.0122, "reward": 2.1272322833538055, "reward_std": 0.19760730862617493, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933035969734192, "step": 2231 }, { "clip_ratio": 0.0, "completion_length": 531.2968978881836, "epoch": 0.6667164513479202, "grad_norm": 0.43557435274124146, "kl": 0.3966064453125, "learning_rate": 6.033882322827338e-06, "loss": 0.0561, "reward": 2.1099331378936768, "reward_std": 0.225278009660542, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9782366454601288, "step": 2232 }, { "clip_ratio": 0.0, "completion_length": 564.544677734375, "epoch": 0.6670151594354418, "grad_norm": 1.2029283046722412, "kl": 0.3150634765625, "learning_rate": 6.024309649736276e-06, "loss": 0.058, "reward": 1.9760045409202576, "reward_std": 0.20374789834022522, "rewards/accuracy_reward": 0.03125000232830644, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9760045111179352, "step": 2233 }, { "clip_ratio": 0.0, "completion_length": 596.5223541259766, "epoch": 0.6673138675229632, "grad_norm": 1.8902983665466309, "kl": 0.60009765625, "learning_rate": 6.0147413018066515e-06, "loss": 0.1147, "reward": 1.9040179550647736, "reward_std": 0.27361947670578957, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.9397321790456772, "rewards/tag_count_reward": 0.9575893133878708, "step": 2234 }, { "clip_ratio": 0.0, "completion_length": 539.5044937133789, "epoch": 0.6676125756104847, "grad_norm": 0.808760404586792, "kl": 0.3076171875, "learning_rate": 6.005177289447895e-06, "loss": 0.1128, "reward": 2.0251117050647736, "reward_std": 0.3063171058893204, "rewards/accuracy_reward": 0.13392857578583062, "rewards/format_reward": 0.9375000447034836, "rewards/tag_count_reward": 0.953683078289032, "step": 2235 }, { "clip_ratio": 0.0, "completion_length": 587.3906555175781, "epoch": 0.6679112836980061, "grad_norm": 1.1241713762283325, "kl": 0.2978515625, "learning_rate": 5.9956176230647115e-06, "loss": 0.1376, "reward": 1.9871653020381927, "reward_std": 0.3314635306596756, "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.9441964626312256, "rewards/tag_count_reward": 0.9581473618745804, "step": 2236 }, { "clip_ratio": 0.0, "completion_length": 565.0781402587891, "epoch": 0.6682099917855276, "grad_norm": 18.340042114257812, "kl": 2.078857421875, "learning_rate": 5.986062313057084e-06, "loss": 0.249, "reward": 1.9190848767757416, "reward_std": 0.4441080018877983, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.9084821790456772, "rewards/tag_count_reward": 0.9324777126312256, "step": 2237 }, { "clip_ratio": 0.0, "completion_length": 562.6317138671875, "epoch": 0.6685086998730491, "grad_norm": 1.0704460144042969, "kl": 0.46875, "learning_rate": 5.97651136982025e-06, "loss": 0.1681, "reward": 1.9760045409202576, "reward_std": 0.40866465866565704, "rewards/accuracy_reward": 0.14508929336443543, "rewards/format_reward": 0.90401791036129, "rewards/tag_count_reward": 0.9268973767757416, "step": 2238 }, { "clip_ratio": 0.0, "completion_length": 551.0178833007812, "epoch": 0.6688074079605706, "grad_norm": 0.7087568640708923, "kl": 0.411376953125, "learning_rate": 5.966964803744701e-06, "loss": 0.1884, "reward": 1.9799107909202576, "reward_std": 0.5461779981851578, "rewards/accuracy_reward": 0.17633929662406445, "rewards/format_reward": 0.8883928954601288, "rewards/tag_count_reward": 0.9151786118745804, "step": 2239 }, { "clip_ratio": 0.0, "completion_length": 560.154052734375, "epoch": 0.669106116048092, "grad_norm": 0.8177587985992432, "kl": 0.4619140625, "learning_rate": 5.957422625216168e-06, "loss": 0.2006, "reward": 1.9001117050647736, "reward_std": 0.4787428751587868, "rewards/accuracy_reward": 0.09821428847499192, "rewards/format_reward": 0.88839291036129, "rewards/tag_count_reward": 0.913504496216774, "step": 2240 }, { "clip_ratio": 0.0, "completion_length": 530.9777069091797, "epoch": 0.6694048241356134, "grad_norm": 0.9035939574241638, "kl": 0.31689453125, "learning_rate": 5.947884844615603e-06, "loss": 0.1293, "reward": 1.9637277722358704, "reward_std": 0.3608165867626667, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.9441964626312256, "rewards/tag_count_reward": 0.9570312947034836, "step": 2241 }, { "clip_ratio": 0.0, "completion_length": 517.4777069091797, "epoch": 0.669703532223135, "grad_norm": 0.5731971859931946, "kl": 0.215576171875, "learning_rate": 5.938351472319177e-06, "loss": 0.1065, "reward": 2.0479911863803864, "reward_std": 0.3110194206237793, "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.965401828289032, "step": 2242 }, { "clip_ratio": 0.0, "completion_length": 475.27904510498047, "epoch": 0.6700022403106564, "grad_norm": 0.4002302587032318, "kl": 0.1236572265625, "learning_rate": 5.928822518698263e-06, "loss": 0.0782, "reward": 2.272321581840515, "reward_std": 0.2798566706478596, "rewards/accuracy_reward": 0.3169642984867096, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9799107611179352, "step": 2243 }, { "clip_ratio": 0.0, "completion_length": 508.74778747558594, "epoch": 0.6703009483981779, "grad_norm": 0.26157575845718384, "kl": 0.1375732421875, "learning_rate": 5.919297994119433e-06, "loss": 0.0582, "reward": 2.0926340222358704, "reward_std": 0.1795903779566288, "rewards/accuracy_reward": 0.11383929336443543, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9899553805589676, "step": 2244 }, { "clip_ratio": 0.0, "completion_length": 470.35047149658203, "epoch": 0.6705996564856993, "grad_norm": 0.31043753027915955, "kl": 0.1241455078125, "learning_rate": 5.909777908944433e-06, "loss": 0.0701, "reward": 2.1802456378936768, "reward_std": 0.21314378082752228, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.98604916036129, "step": 2245 }, { "clip_ratio": 0.0, "completion_length": 501.77904510498047, "epoch": 0.6708983645732208, "grad_norm": 0.16017626225948334, "kl": 0.1168212890625, "learning_rate": 5.9002622735301815e-06, "loss": 0.0368, "reward": 2.1434153020381927, "reward_std": 0.16081068105995655, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652126312256, "step": 2246 }, { "clip_ratio": 0.0, "completion_length": 471.3080520629883, "epoch": 0.6711970726607422, "grad_norm": 0.17108573019504547, "kl": 0.1168212890625, "learning_rate": 5.89075109822876e-06, "loss": 0.0426, "reward": 2.118861734867096, "reward_std": 0.14968278259038925, "rewards/accuracy_reward": 0.1450892873108387, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9893973469734192, "step": 2247 }, { "clip_ratio": 0.0, "completion_length": 472.54019927978516, "epoch": 0.6714957807482638, "grad_norm": 0.2603254020214081, "kl": 0.1004638671875, "learning_rate": 5.881244393387395e-06, "loss": 0.0245, "reward": 2.0122768878936768, "reward_std": 0.1414729654788971, "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875298023224, "step": 2248 }, { "clip_ratio": 0.0, "completion_length": 432.9866256713867, "epoch": 0.6717944888357852, "grad_norm": 0.16900880634784698, "kl": 0.1094970703125, "learning_rate": 5.871742169348447e-06, "loss": 0.0093, "reward": 2.0457590222358704, "reward_std": 0.11416336987167597, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9966517984867096, "step": 2249 }, { "clip_ratio": 0.0, "completion_length": 477.4955596923828, "epoch": 0.6720931969233067, "grad_norm": 0.21831680834293365, "kl": 0.1158447265625, "learning_rate": 5.862244436449405e-06, "loss": 0.064, "reward": 2.082031339406967, "reward_std": 0.1972164809703827, "rewards/accuracy_reward": 0.10267857438884676, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.990513414144516, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 474.77457427978516, "epoch": 0.6723919050108281, "grad_norm": 0.16336795687675476, "kl": 0.1195068359375, "learning_rate": 5.852751205022875e-06, "loss": 0.0678, "reward": 2.054687589406967, "reward_std": 0.17372700199484825, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9832589626312256, "step": 2251 }, { "clip_ratio": 0.0, "completion_length": 473.5067138671875, "epoch": 0.6726906130983497, "grad_norm": 0.12196514010429382, "kl": 0.100830078125, "learning_rate": 5.84326248539656e-06, "loss": 0.0226, "reward": 2.0585938096046448, "reward_std": 0.10689675435423851, "rewards/accuracy_reward": 0.06696428754366934, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 2252 }, { "clip_ratio": 0.0, "completion_length": 474.2924346923828, "epoch": 0.6729893211858711, "grad_norm": 0.2090972512960434, "kl": 0.1385498046875, "learning_rate": 5.833778287893257e-06, "loss": 0.0761, "reward": 2.0887277722358704, "reward_std": 0.2420149128884077, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9838170111179352, "step": 2253 }, { "clip_ratio": 0.0, "completion_length": 447.7120819091797, "epoch": 0.6732880292733926, "grad_norm": 0.15815484523773193, "kl": 0.089111328125, "learning_rate": 5.82429862283084e-06, "loss": 0.0459, "reward": 2.0457590520381927, "reward_std": 0.17126222606748343, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 2254 }, { "clip_ratio": 0.0, "completion_length": 471.63394927978516, "epoch": 0.673586737360914, "grad_norm": 0.22165624797344208, "kl": 0.11279296875, "learning_rate": 5.81482350052226e-06, "loss": 0.0535, "reward": 2.0513393878936768, "reward_std": 0.1542956717312336, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9910714477300644, "step": 2255 }, { "clip_ratio": 0.0, "completion_length": 502.5558319091797, "epoch": 0.6738854454484355, "grad_norm": 0.27161040902137756, "kl": 0.1094970703125, "learning_rate": 5.805352931275522e-06, "loss": 0.0288, "reward": 2.111607253551483, "reward_std": 0.18772686272859573, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9888393133878708, "step": 2256 }, { "clip_ratio": 0.0, "completion_length": 491.2701110839844, "epoch": 0.674184153535957, "grad_norm": 0.19074468314647675, "kl": 0.12841796875, "learning_rate": 5.795886925393672e-06, "loss": 0.064, "reward": 2.0184152722358704, "reward_std": 0.11318524181842804, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973618745804, "step": 2257 }, { "clip_ratio": 0.0, "completion_length": 490.0044860839844, "epoch": 0.6744828616234785, "grad_norm": 0.1410241276025772, "kl": 0.10205078125, "learning_rate": 5.786425493174801e-06, "loss": 0.0267, "reward": 2.083705484867096, "reward_std": 0.09840513207018375, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966518133878708, "step": 2258 }, { "clip_ratio": 0.0, "completion_length": 485.6718978881836, "epoch": 0.6747815697109999, "grad_norm": 0.12813377380371094, "kl": 0.1048583984375, "learning_rate": 5.7769686449120225e-06, "loss": 0.0257, "reward": 2.1177456378936768, "reward_std": 0.15131548419594765, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.990513414144516, "step": 2259 }, { "clip_ratio": 0.0, "completion_length": 478.0491256713867, "epoch": 0.6750802777985214, "grad_norm": 0.14539262652397156, "kl": 0.103271484375, "learning_rate": 5.767516390893451e-06, "loss": 0.0429, "reward": 2.0792412161827087, "reward_std": 0.1794506572186947, "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9899553954601288, "step": 2260 }, { "clip_ratio": 0.0, "completion_length": 475.30582427978516, "epoch": 0.6753789858860428, "grad_norm": 0.2382662296295166, "kl": 0.1495361328125, "learning_rate": 5.758068741402223e-06, "loss": 0.0886, "reward": 2.1400670409202576, "reward_std": 0.24810832366347313, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.977120578289032, "step": 2261 }, { "clip_ratio": 0.0, "completion_length": 525.3683319091797, "epoch": 0.6756776939735644, "grad_norm": 0.14406238496303558, "kl": 0.110107421875, "learning_rate": 5.748625706716448e-06, "loss": 0.0373, "reward": 2.0345982909202576, "reward_std": 0.16695569455623627, "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553954601288, "step": 2262 }, { "clip_ratio": 0.0, "completion_length": 496.13841247558594, "epoch": 0.6759764020610858, "grad_norm": 0.16346938908100128, "kl": 0.1077880859375, "learning_rate": 5.739187297109223e-06, "loss": 0.0462, "reward": 2.0954242050647736, "reward_std": 0.16746445186436176, "rewards/accuracy_reward": 0.11607143259607255, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.990513414144516, "step": 2263 }, { "clip_ratio": 0.0, "completion_length": 539.897346496582, "epoch": 0.6762751101486073, "grad_norm": 0.1481361836194992, "kl": 0.4859619140625, "learning_rate": 5.729753522848618e-06, "loss": 0.0264, "reward": 2.071428656578064, "reward_std": 0.25315483659505844, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9821428954601288, "step": 2264 }, { "clip_ratio": 0.0, "completion_length": 549.5111770629883, "epoch": 0.6765738182361287, "grad_norm": 0.3174876570701599, "kl": 0.2003173828125, "learning_rate": 5.720324394197649e-06, "loss": 0.0298, "reward": 2.099330484867096, "reward_std": 0.20912921521812677, "rewards/accuracy_reward": 0.12276786239817739, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553954601288, "step": 2265 }, { "clip_ratio": 0.0, "completion_length": 543.4933319091797, "epoch": 0.6768725263236502, "grad_norm": 0.1411864459514618, "kl": 0.1129150390625, "learning_rate": 5.710899921414284e-06, "loss": 0.0495, "reward": 2.075334906578064, "reward_std": 0.1685822755098343, "rewards/accuracy_reward": 0.10267857811413705, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9882812798023224, "step": 2266 }, { "clip_ratio": 0.0, "completion_length": 565.1763610839844, "epoch": 0.6771712344111717, "grad_norm": 0.10633184015750885, "kl": 0.091796875, "learning_rate": 5.701480114751432e-06, "loss": 0.0196, "reward": 2.1082590222358704, "reward_std": 0.0974982175976038, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 2267 }, { "clip_ratio": 0.0, "completion_length": 542.1093902587891, "epoch": 0.6774699424986932, "grad_norm": 0.1572662740945816, "kl": 0.0902099609375, "learning_rate": 5.692064984456911e-06, "loss": 0.0446, "reward": 2.119419753551483, "reward_std": 0.21335318312048912, "rewards/accuracy_reward": 0.14732143748551607, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.98995541036129, "step": 2268 }, { "clip_ratio": 0.0, "completion_length": 569.8437652587891, "epoch": 0.6777686505862146, "grad_norm": 0.08309128135442734, "kl": 0.1126708984375, "learning_rate": 5.6826545407734636e-06, "loss": 0.0214, "reward": 2.0574778020381927, "reward_std": 0.1196656497195363, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9860491305589676, "step": 2269 }, { "clip_ratio": 0.0, "completion_length": 577.9732360839844, "epoch": 0.6780673586737361, "grad_norm": 0.18310382962226868, "kl": 0.106689453125, "learning_rate": 5.673248793938735e-06, "loss": 0.0428, "reward": 2.04631707072258, "reward_std": 0.16366672702133656, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9905134439468384, "step": 2270 }, { "clip_ratio": 0.0, "completion_length": 581.7991256713867, "epoch": 0.6783660667612575, "grad_norm": 0.31548184156417847, "kl": 0.0921630859375, "learning_rate": 5.663847754185246e-06, "loss": 0.0306, "reward": 2.0440849363803864, "reward_std": 0.16364002786576748, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9905134290456772, "step": 2271 }, { "clip_ratio": 0.0, "completion_length": 597.7902069091797, "epoch": 0.6786647748487791, "grad_norm": 0.1553804874420166, "kl": 0.0877685546875, "learning_rate": 5.654451431740417e-06, "loss": 0.0526, "reward": 2.040736734867096, "reward_std": 0.22732827439904213, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9871652275323868, "step": 2272 }, { "clip_ratio": 0.0, "completion_length": 608.9709930419922, "epoch": 0.6789634829363005, "grad_norm": 0.12650960683822632, "kl": 0.085693359375, "learning_rate": 5.645059836826518e-06, "loss": 0.0227, "reward": 2.1400670409202576, "reward_std": 0.14359961077570915, "rewards/accuracy_reward": 0.15401786286383867, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949777126312256, "step": 2273 }, { "clip_ratio": 0.0, "completion_length": 584.0937805175781, "epoch": 0.679262191023822, "grad_norm": 0.11492251604795456, "kl": 0.103515625, "learning_rate": 5.6356729796606844e-06, "loss": 0.0375, "reward": 2.0731027126312256, "reward_std": 0.1491047628223896, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9882812798023224, "step": 2274 }, { "clip_ratio": 0.0, "completion_length": 591.169677734375, "epoch": 0.6795608991113434, "grad_norm": 0.14299757778644562, "kl": 0.0887451171875, "learning_rate": 5.626290870454905e-06, "loss": 0.0417, "reward": 2.0323661267757416, "reward_std": 0.12877880968153477, "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553954601288, "step": 2275 }, { "clip_ratio": 0.0, "completion_length": 571.8750152587891, "epoch": 0.679859607198865, "grad_norm": 2.701746702194214, "kl": 0.469482421875, "learning_rate": 5.616913519415983e-06, "loss": 0.0658, "reward": 2.0691965222358704, "reward_std": 0.21035540476441383, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.988839328289032, "step": 2276 }, { "clip_ratio": 0.0, "completion_length": 569.2098541259766, "epoch": 0.6801583152863864, "grad_norm": 0.18020427227020264, "kl": 0.1041259765625, "learning_rate": 5.607540936745564e-06, "loss": 0.0364, "reward": 2.2215402722358704, "reward_std": 0.15718754567205906, "rewards/accuracy_reward": 0.2321428656578064, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937798023224, "step": 2277 }, { "clip_ratio": 0.0, "completion_length": 588.4709930419922, "epoch": 0.6804570233739079, "grad_norm": 0.16487333178520203, "kl": 0.09423828125, "learning_rate": 5.598173132640102e-06, "loss": 0.039, "reward": 2.06584832072258, "reward_std": 0.14393239095807076, "rewards/accuracy_reward": 0.08482143469154835, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9921875149011612, "step": 2278 }, { "clip_ratio": 0.0, "completion_length": 547.334846496582, "epoch": 0.6807557314614293, "grad_norm": 0.12234886735677719, "kl": 0.104736328125, "learning_rate": 5.588810117290843e-06, "loss": 0.0467, "reward": 2.041852831840515, "reward_std": 0.21296042669564486, "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9860491454601288, "step": 2279 }, { "clip_ratio": 0.0, "completion_length": 556.4754791259766, "epoch": 0.6810544395489508, "grad_norm": 0.12739188969135284, "kl": 0.0916748046875, "learning_rate": 5.579451900883833e-06, "loss": 0.0281, "reward": 2.0998885333538055, "reward_std": 0.11489041149616241, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 2280 }, { "clip_ratio": 0.0, "completion_length": 576.1830596923828, "epoch": 0.6813531476364723, "grad_norm": 0.1540086567401886, "kl": 0.0885009765625, "learning_rate": 5.570098493599898e-06, "loss": 0.0326, "reward": 2.1171876192092896, "reward_std": 0.14212011359632015, "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9921875298023224, "step": 2281 }, { "clip_ratio": 0.0, "completion_length": 574.3973388671875, "epoch": 0.6816518557239938, "grad_norm": 0.11883820593357086, "kl": 0.0953369140625, "learning_rate": 5.5607499056146216e-06, "loss": 0.0399, "reward": 2.041294753551483, "reward_std": 0.1239640424028039, "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553805589676, "step": 2282 }, { "clip_ratio": 0.0, "completion_length": 556.8125305175781, "epoch": 0.6819505638115152, "grad_norm": 0.21026578545570374, "kl": 0.0888671875, "learning_rate": 5.551406147098355e-06, "loss": 0.0413, "reward": 2.1668527722358704, "reward_std": 0.15900138020515442, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 2283 }, { "clip_ratio": 0.0, "completion_length": 561.256721496582, "epoch": 0.6822492718990366, "grad_norm": 1.0231939554214478, "kl": 0.1832275390625, "learning_rate": 5.542067228216195e-06, "loss": 0.0594, "reward": 2.0781251192092896, "reward_std": 0.1455477885901928, "rewards/accuracy_reward": 0.09821429220028222, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714775323868, "step": 2284 }, { "clip_ratio": 0.0, "completion_length": 623.982177734375, "epoch": 0.6825479799865581, "grad_norm": 0.2913665473461151, "kl": 0.1148681640625, "learning_rate": 5.532733159127963e-06, "loss": 0.0194, "reward": 2.0948662161827087, "reward_std": 0.19670528173446655, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9921875298023224, "step": 2285 }, { "clip_ratio": 0.0, "completion_length": 590.4509124755859, "epoch": 0.6828466880740796, "grad_norm": 0.16056783497333527, "kl": 0.265380859375, "learning_rate": 5.523403949988217e-06, "loss": 0.0084, "reward": 2.0675223767757416, "reward_std": 0.16440021619200706, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616305589676, "step": 2286 }, { "clip_ratio": 0.0, "completion_length": 605.6674346923828, "epoch": 0.6831453961616011, "grad_norm": 1.1177860498428345, "kl": 0.14013671875, "learning_rate": 5.514079610946217e-06, "loss": 0.0547, "reward": 2.0150670409202576, "reward_std": 0.1810532659292221, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9860491305589676, "step": 2287 }, { "clip_ratio": 0.0, "completion_length": 548.3951110839844, "epoch": 0.6834441042491225, "grad_norm": 0.8177090883255005, "kl": 0.3270263671875, "learning_rate": 5.504760152145934e-06, "loss": 0.0376, "reward": 2.1054688096046448, "reward_std": 0.16459326166659594, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.9871651977300644, "step": 2288 }, { "clip_ratio": 0.0, "completion_length": 587.8594207763672, "epoch": 0.683742812336644, "grad_norm": 0.18658681213855743, "kl": 0.11083984375, "learning_rate": 5.4954455837260265e-06, "loss": 0.0353, "reward": 2.130580484867096, "reward_std": 0.1995290443301201, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9899553805589676, "step": 2289 }, { "clip_ratio": 0.0, "completion_length": 547.6629791259766, "epoch": 0.6840415204241654, "grad_norm": 0.45031479001045227, "kl": 0.1630859375, "learning_rate": 5.486135915819827e-06, "loss": 0.0862, "reward": 2.1729912161827087, "reward_std": 0.18120701611042023, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9854911118745804, "step": 2290 }, { "clip_ratio": 0.0, "completion_length": 537.2879638671875, "epoch": 0.684340228511687, "grad_norm": 0.16823844611644745, "kl": 0.107421875, "learning_rate": 5.476831158555345e-06, "loss": 0.0551, "reward": 2.162946581840515, "reward_std": 0.1813310645520687, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.988839328289032, "step": 2291 }, { "clip_ratio": 0.0, "completion_length": 540.3638534545898, "epoch": 0.6846389365992084, "grad_norm": 0.24840721487998962, "kl": 0.1351318359375, "learning_rate": 5.467531322055247e-06, "loss": 0.0293, "reward": 2.1054688096046448, "reward_std": 0.23045930452644825, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.9849330484867096, "step": 2292 }, { "clip_ratio": 0.0, "completion_length": 568.919677734375, "epoch": 0.6849376446867299, "grad_norm": 0.37335771322250366, "kl": 0.3160400390625, "learning_rate": 5.458236416436838e-06, "loss": 0.0516, "reward": 2.1623884439468384, "reward_std": 0.15628127194941044, "rewards/accuracy_reward": 0.18303572572767735, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 2293 }, { "clip_ratio": 0.0, "completion_length": 566.2455596923828, "epoch": 0.6852363527742513, "grad_norm": 0.4965033531188965, "kl": 0.1845703125, "learning_rate": 5.448946451812067e-06, "loss": 0.0697, "reward": 2.0039063692092896, "reward_std": 0.19293048232793808, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9838170111179352, "step": 2294 }, { "clip_ratio": 0.0, "completion_length": 523.5446624755859, "epoch": 0.6855350608617728, "grad_norm": 0.16103824973106384, "kl": 0.130859375, "learning_rate": 5.43966143828751e-06, "loss": 0.0669, "reward": 2.0563617050647736, "reward_std": 0.15706433728337288, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652275323868, "step": 2295 }, { "clip_ratio": 0.0, "completion_length": 517.4308319091797, "epoch": 0.6858337689492943, "grad_norm": 1.1236084699630737, "kl": 0.3865966796875, "learning_rate": 5.430381385964343e-06, "loss": 0.0751, "reward": 2.1646206378936768, "reward_std": 0.21433288231492043, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.983816996216774, "step": 2296 }, { "clip_ratio": 0.0, "completion_length": 556.9732360839844, "epoch": 0.6861324770368158, "grad_norm": 0.6840149164199829, "kl": 0.3236083984375, "learning_rate": 5.421106304938356e-06, "loss": 0.0487, "reward": 2.165178656578064, "reward_std": 0.10753842070698738, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.995535746216774, "step": 2297 }, { "clip_ratio": 0.0, "completion_length": 550.2120819091797, "epoch": 0.6864311851243372, "grad_norm": 0.8726528286933899, "kl": 0.377197265625, "learning_rate": 5.411836205299934e-06, "loss": 0.0675, "reward": 2.0625000596046448, "reward_std": 0.18246626947075129, "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9821428954601288, "step": 2298 }, { "clip_ratio": 0.0, "completion_length": 553.622802734375, "epoch": 0.6867298932118587, "grad_norm": 0.17643089592456818, "kl": 0.1038818359375, "learning_rate": 5.402571097134029e-06, "loss": 0.0196, "reward": 2.1205357909202576, "reward_std": 0.12897825613617897, "rewards/accuracy_reward": 0.13169643585570157, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 2299 }, { "clip_ratio": 0.0, "completion_length": 549.5268096923828, "epoch": 0.6870286012993801, "grad_norm": 0.18515878915786743, "kl": 0.1474609375, "learning_rate": 5.393310990520177e-06, "loss": 0.0314, "reward": 2.044642984867096, "reward_std": 0.16450568474829197, "rewards/accuracy_reward": 0.06696429057046771, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.988839328289032, "step": 2300 }, { "clip_ratio": 0.0, "completion_length": 517.0491256713867, "epoch": 0.6873273093869017, "grad_norm": 0.12057393789291382, "kl": 0.09130859375, "learning_rate": 5.384055895532458e-06, "loss": 0.034, "reward": 2.0669643878936768, "reward_std": 0.10753842256963253, "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 2301 }, { "clip_ratio": 0.0, "completion_length": 582.060302734375, "epoch": 0.6876260174744231, "grad_norm": 0.09649860858917236, "kl": 0.0960693359375, "learning_rate": 5.374805822239516e-06, "loss": 0.0022, "reward": 2.0825894474983215, "reward_std": 0.03495405800640583, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 1.0, "step": 2302 }, { "clip_ratio": 0.0, "completion_length": 546.1406707763672, "epoch": 0.6879247255619446, "grad_norm": 0.7096304297447205, "kl": 0.377685546875, "learning_rate": 5.365560780704524e-06, "loss": 0.0523, "reward": 2.1489956378936768, "reward_std": 0.18304398283362389, "rewards/accuracy_reward": 0.16517857694998384, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.992745578289032, "step": 2303 }, { "clip_ratio": 0.0, "completion_length": 562.3214416503906, "epoch": 0.688223433649466, "grad_norm": 1.45001220703125, "kl": 0.4012451171875, "learning_rate": 5.356320780985176e-06, "loss": 0.0574, "reward": 2.055803656578064, "reward_std": 0.1333034597337246, "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714775323868, "step": 2304 }, { "clip_ratio": 0.0, "completion_length": 554.8571701049805, "epoch": 0.6885221417369876, "grad_norm": 0.1794653683900833, "kl": 0.187744140625, "learning_rate": 5.347085833133689e-06, "loss": -0.0117, "reward": 2.1556921005249023, "reward_std": 0.10886568389832973, "rewards/accuracy_reward": 0.16294643701985478, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.997209832072258, "step": 2305 }, { "clip_ratio": 0.0, "completion_length": 521.8527069091797, "epoch": 0.688820849824509, "grad_norm": 0.36596807837486267, "kl": 0.198486328125, "learning_rate": 5.337855947196784e-06, "loss": 0.0238, "reward": 2.150111734867096, "reward_std": 0.1565728448331356, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9916295111179352, "step": 2306 }, { "clip_ratio": 0.0, "completion_length": 564.7388610839844, "epoch": 0.6891195579120305, "grad_norm": 3.4996631145477295, "kl": 0.3736572265625, "learning_rate": 5.328631133215665e-06, "loss": 0.009, "reward": 2.1210938096046448, "reward_std": 0.17706676572561264, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9960937798023224, "step": 2307 }, { "clip_ratio": 0.0, "completion_length": 542.1786041259766, "epoch": 0.6894182659995519, "grad_norm": 0.45999395847320557, "kl": 0.177490234375, "learning_rate": 5.31941140122603e-06, "loss": 0.0073, "reward": 2.0825893878936768, "reward_std": 0.17220186069607735, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 2308 }, { "clip_ratio": 0.0, "completion_length": 584.7522583007812, "epoch": 0.6897169740870734, "grad_norm": 0.4133806824684143, "kl": 0.215576171875, "learning_rate": 5.310196761258048e-06, "loss": 0.0223, "reward": 2.0351562798023224, "reward_std": 0.10978634096682072, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9927455633878708, "step": 2309 }, { "clip_ratio": 0.0, "completion_length": 556.0669860839844, "epoch": 0.6900156821745949, "grad_norm": 0.10451837629079819, "kl": 0.0751953125, "learning_rate": 5.300987223336334e-06, "loss": 0.0085, "reward": 2.1406250596046448, "reward_std": 0.1018661530688405, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2310 }, { "clip_ratio": 0.0, "completion_length": 577.1049346923828, "epoch": 0.6903143902621164, "grad_norm": 0.1366843730211258, "kl": 0.0791015625, "learning_rate": 5.29178279747997e-06, "loss": 0.0275, "reward": 2.091517984867096, "reward_std": 0.13706324435770512, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 2311 }, { "clip_ratio": 0.0, "completion_length": 578.5000305175781, "epoch": 0.6906130983496378, "grad_norm": 0.5751811861991882, "kl": 0.5516357421875, "learning_rate": 5.282583493702471e-06, "loss": 0.0248, "reward": 2.070312589406967, "reward_std": 0.1554046580567956, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9899553954601288, "step": 2312 }, { "clip_ratio": 0.0, "completion_length": 579.4352874755859, "epoch": 0.6909118064371593, "grad_norm": 0.8412837386131287, "kl": 0.3367919921875, "learning_rate": 5.273389322011771e-06, "loss": 0.0602, "reward": 2.1367188096046448, "reward_std": 0.16891740635037422, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9916294813156128, "step": 2313 }, { "clip_ratio": 0.0, "completion_length": 558.9821701049805, "epoch": 0.6912105145246807, "grad_norm": 2.19205641746521, "kl": 0.67431640625, "learning_rate": 5.2642002924102334e-06, "loss": 0.0874, "reward": 2.085937649011612, "reward_std": 0.23447431437671185, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9832589626312256, "step": 2314 }, { "clip_ratio": 0.0, "completion_length": 573.0982208251953, "epoch": 0.6915092226122023, "grad_norm": 0.3228515684604645, "kl": 0.1546630859375, "learning_rate": 5.255016414894616e-06, "loss": 0.0088, "reward": 2.1863839626312256, "reward_std": 0.15281121619045734, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 2315 }, { "clip_ratio": 0.0, "completion_length": 576.4933166503906, "epoch": 0.6918079306997237, "grad_norm": 0.34568947553634644, "kl": 0.33056640625, "learning_rate": 5.245837699456083e-06, "loss": 0.0392, "reward": 2.1250000596046448, "reward_std": 0.1934623122215271, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933035969734192, "step": 2316 }, { "clip_ratio": 0.0, "completion_length": 591.060302734375, "epoch": 0.6921066387872452, "grad_norm": 0.356583833694458, "kl": 0.2867431640625, "learning_rate": 5.236664156080175e-06, "loss": 0.019, "reward": 2.1177456378936768, "reward_std": 0.14004047773778439, "rewards/accuracy_reward": 0.12500000302679837, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 2317 }, { "clip_ratio": 0.0, "completion_length": 582.6897583007812, "epoch": 0.6924053468747666, "grad_norm": 1.8009735345840454, "kl": 0.8748779296875, "learning_rate": 5.227495794746806e-06, "loss": 0.0582, "reward": 2.092076003551483, "reward_std": 0.16301273182034492, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9916294813156128, "step": 2318 }, { "clip_ratio": 0.0, "completion_length": 592.3460083007812, "epoch": 0.6927040549622882, "grad_norm": 1.206982135772705, "kl": 0.5369873046875, "learning_rate": 5.218332625430258e-06, "loss": 0.0649, "reward": 2.042968839406967, "reward_std": 0.1990203633904457, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973469734192, "step": 2319 }, { "clip_ratio": 0.0, "completion_length": 590.0714569091797, "epoch": 0.6930027630498096, "grad_norm": 0.15231291949748993, "kl": 0.3504638671875, "learning_rate": 5.209174658099162e-06, "loss": 0.0038, "reward": 2.10881707072258, "reward_std": 0.11988569144159555, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9927455484867096, "step": 2320 }, { "clip_ratio": 0.0, "completion_length": 617.7120819091797, "epoch": 0.6933014711373311, "grad_norm": 0.24225319921970367, "kl": 0.18212890625, "learning_rate": 5.200021902716483e-06, "loss": 0.0321, "reward": 2.0401786267757416, "reward_std": 0.1607905998826027, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933036118745804, "step": 2321 }, { "clip_ratio": 0.0, "completion_length": 602.1674346923828, "epoch": 0.6936001792248525, "grad_norm": 1.4123752117156982, "kl": 0.57421875, "learning_rate": 5.190874369239526e-06, "loss": 0.0516, "reward": 2.050781339406967, "reward_std": 0.1590961031615734, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9882812947034836, "step": 2322 }, { "clip_ratio": 0.0, "completion_length": 586.7165374755859, "epoch": 0.693898887312374, "grad_norm": 1.600246548652649, "kl": 0.58740234375, "learning_rate": 5.181732067619913e-06, "loss": 0.0612, "reward": 2.1568081378936768, "reward_std": 0.2162460032850504, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9916295111179352, "step": 2323 }, { "clip_ratio": 0.0, "completion_length": 600.3638610839844, "epoch": 0.6941975953998955, "grad_norm": 2.638596773147583, "kl": 0.815673828125, "learning_rate": 5.172595007803567e-06, "loss": 0.0668, "reward": 2.0541295409202576, "reward_std": 0.15805412642657757, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9871652275323868, "step": 2324 }, { "clip_ratio": 0.0, "completion_length": 635.872802734375, "epoch": 0.694496303487417, "grad_norm": 0.1306127905845642, "kl": 0.1097412109375, "learning_rate": 5.1634631997307165e-06, "loss": 0.0116, "reward": 2.084263503551483, "reward_std": 0.11289609037339687, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.997209832072258, "step": 2325 }, { "clip_ratio": 0.0, "completion_length": 584.6272659301758, "epoch": 0.6947950115749384, "grad_norm": 2.2319090366363525, "kl": 0.7811279296875, "learning_rate": 5.1543366533358755e-06, "loss": 0.0845, "reward": 2.0976563096046448, "reward_std": 0.2748827412724495, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9860491454601288, "step": 2326 }, { "clip_ratio": 0.0, "completion_length": 623.5111694335938, "epoch": 0.6950937196624598, "grad_norm": 0.8217201232910156, "kl": 0.6943359375, "learning_rate": 5.145215378547825e-06, "loss": 0.0491, "reward": 2.1138394474983215, "reward_std": 0.2266843430697918, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.988839328289032, "step": 2327 }, { "clip_ratio": 0.0, "completion_length": 628.6607360839844, "epoch": 0.6953924277499813, "grad_norm": 0.9698916077613831, "kl": 0.3104248046875, "learning_rate": 5.136099385289628e-06, "loss": 0.0386, "reward": 2.0585938692092896, "reward_std": 0.1551600033417344, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.991629496216774, "step": 2328 }, { "clip_ratio": 0.0, "completion_length": 589.8170013427734, "epoch": 0.6956911358375027, "grad_norm": 1.2414265871047974, "kl": 0.6070556640625, "learning_rate": 5.126988683478582e-06, "loss": 0.0854, "reward": 2.1372768878936768, "reward_std": 0.2109485138207674, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9899553954601288, "step": 2329 }, { "clip_ratio": 0.0, "completion_length": 567.0178833007812, "epoch": 0.6959898439250243, "grad_norm": 1.319916009902954, "kl": 0.5531005859375, "learning_rate": 5.117883283026243e-06, "loss": 0.0591, "reward": 2.1054688096046448, "reward_std": 0.1715819500386715, "rewards/accuracy_reward": 0.13839286239817739, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.984933078289032, "step": 2330 }, { "clip_ratio": 0.0, "completion_length": 615.3794708251953, "epoch": 0.6962885520125457, "grad_norm": 0.5523761510848999, "kl": 0.828125, "learning_rate": 5.108783193838396e-06, "loss": 0.0441, "reward": 2.0200893878936768, "reward_std": 0.18237296119332314, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9866071939468384, "step": 2331 }, { "clip_ratio": 0.0, "completion_length": 603.2879638671875, "epoch": 0.6965872601000672, "grad_norm": 118.88762664794922, "kl": 1.26513671875, "learning_rate": 5.099688425815039e-06, "loss": 0.0677, "reward": 2.091517984867096, "reward_std": 0.2252875417470932, "rewards/accuracy_reward": 0.12946429383009672, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.98214291036129, "step": 2332 }, { "clip_ratio": 0.0, "completion_length": 597.4464569091797, "epoch": 0.6968859681875886, "grad_norm": 0.39911192655563354, "kl": 0.1966552734375, "learning_rate": 5.0905989888503924e-06, "loss": 0.0423, "reward": 2.0658482909202576, "reward_std": 0.12158025242388248, "rewards/accuracy_reward": 0.07589286309666932, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9966517984867096, "step": 2333 }, { "clip_ratio": 0.0, "completion_length": 593.6004791259766, "epoch": 0.6971846762751102, "grad_norm": 1.4551736116409302, "kl": 0.3333740234375, "learning_rate": 5.081514892832878e-06, "loss": 0.0418, "reward": 2.0982143878936768, "reward_std": 0.14255047962069511, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.995535746216774, "step": 2334 }, { "clip_ratio": 0.0, "completion_length": 633.4620819091797, "epoch": 0.6974833843626316, "grad_norm": 0.3856094181537628, "kl": 0.390625, "learning_rate": 5.0724361476450925e-06, "loss": 0.0224, "reward": 2.045759081840515, "reward_std": 0.1392341312021017, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9921875447034836, "step": 2335 }, { "clip_ratio": 0.0, "completion_length": 615.0268249511719, "epoch": 0.6977820924501531, "grad_norm": 0.8719306588172913, "kl": 0.5250244140625, "learning_rate": 5.063362763163826e-06, "loss": 0.0614, "reward": 2.0736608505249023, "reward_std": 0.12734234612435102, "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714626312256, "step": 2336 }, { "clip_ratio": 0.0, "completion_length": 596.4665374755859, "epoch": 0.6980808005376745, "grad_norm": 1.3276149034500122, "kl": 0.9195556640625, "learning_rate": 5.0542947492600336e-06, "loss": 0.0932, "reward": 2.047991156578064, "reward_std": 0.2398434840142727, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9832589626312256, "step": 2337 }, { "clip_ratio": 0.0, "completion_length": 631.4486846923828, "epoch": 0.698379508625196, "grad_norm": 0.8887019753456116, "kl": 0.4373779296875, "learning_rate": 5.045232115798819e-06, "loss": 0.0504, "reward": 2.0965403020381927, "reward_std": 0.13684550486505032, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 2338 }, { "clip_ratio": 0.0, "completion_length": 610.9933319091797, "epoch": 0.6986782167127175, "grad_norm": 0.7384017109870911, "kl": 0.323974609375, "learning_rate": 5.0361748726394435e-06, "loss": 0.0551, "reward": 2.076451003551483, "reward_std": 0.178282274864614, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973469734192, "step": 2339 }, { "clip_ratio": 0.0, "completion_length": 590.6942291259766, "epoch": 0.698976924800239, "grad_norm": 0.7707934975624084, "kl": 0.319091796875, "learning_rate": 5.027123029635301e-06, "loss": 0.0714, "reward": 2.0625001192092896, "reward_std": 0.15343830175697803, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9910714626312256, "step": 2340 }, { "clip_ratio": 0.0, "completion_length": 560.1518096923828, "epoch": 0.6992756328877604, "grad_norm": 1.4814164638519287, "kl": 0.53515625, "learning_rate": 5.018076596633907e-06, "loss": 0.0479, "reward": 2.0982143878936768, "reward_std": 0.11029373481869698, "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 2341 }, { "clip_ratio": 0.0, "completion_length": 573.3058319091797, "epoch": 0.6995743409752819, "grad_norm": 1.1715346574783325, "kl": 0.48193359375, "learning_rate": 5.009035583476898e-06, "loss": 0.0723, "reward": 2.089843899011612, "reward_std": 0.20219067204743624, "rewards/accuracy_reward": 0.12053572130389512, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9871651977300644, "step": 2342 }, { "clip_ratio": 0.0, "completion_length": 608.5915374755859, "epoch": 0.6998730490628033, "grad_norm": 2.397639751434326, "kl": 0.5703125, "learning_rate": 5.000000000000003e-06, "loss": 0.0497, "reward": 2.1316965222358704, "reward_std": 0.1581052877008915, "rewards/accuracy_reward": 0.15401786123402417, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714775323868, "step": 2343 }, { "clip_ratio": 0.0, "completion_length": 585.2053833007812, "epoch": 0.7001717571503249, "grad_norm": 1.8685288429260254, "kl": 0.297607421875, "learning_rate": 4.990969856033055e-06, "loss": 0.0357, "reward": 2.071428656578064, "reward_std": 0.09745406173169613, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 2344 }, { "clip_ratio": 0.0, "completion_length": 640.857177734375, "epoch": 0.7004704652378463, "grad_norm": 0.3652750551700592, "kl": 0.3367919921875, "learning_rate": 4.981945161399969e-06, "loss": 0.0457, "reward": 2.0630581378936768, "reward_std": 0.16734039038419724, "rewards/accuracy_reward": 0.09821428591385484, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9871652275323868, "step": 2345 }, { "clip_ratio": 0.0, "completion_length": 572.6049346923828, "epoch": 0.7007691733253678, "grad_norm": 5.053745269775391, "kl": 1.088134765625, "learning_rate": 4.9729259259187235e-06, "loss": 0.1139, "reward": 2.068638503551483, "reward_std": 0.137535585090518, "rewards/accuracy_reward": 0.09598214854486287, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9882812798023224, "step": 2346 }, { "clip_ratio": 0.0, "completion_length": 627.2455596923828, "epoch": 0.7010678814128892, "grad_norm": 1.136798620223999, "kl": 0.850830078125, "learning_rate": 4.963912159401363e-06, "loss": 0.0488, "reward": 1.9966518580913544, "reward_std": 0.10213530249893665, "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 2347 }, { "clip_ratio": 0.0, "completion_length": 608.9553833007812, "epoch": 0.7013665895004108, "grad_norm": 1.0474061965942383, "kl": 0.56494140625, "learning_rate": 4.9549038716539865e-06, "loss": 0.0664, "reward": 2.044642925262451, "reward_std": 0.12139182165265083, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.988839328289032, "step": 2348 }, { "clip_ratio": 0.0, "completion_length": 571.0558395385742, "epoch": 0.7016652975879322, "grad_norm": 1.5510966777801514, "kl": 0.70458984375, "learning_rate": 4.945901072476723e-06, "loss": 0.0713, "reward": 2.0708706080913544, "reward_std": 0.21995650231838226, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9905134290456772, "step": 2349 }, { "clip_ratio": 0.0, "completion_length": 590.9286041259766, "epoch": 0.7019640056754537, "grad_norm": 0.6119505763053894, "kl": 0.2919921875, "learning_rate": 4.936903771663737e-06, "loss": 0.0315, "reward": 2.0831474661827087, "reward_std": 0.11061189696192741, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9916295111179352, "step": 2350 }, { "clip_ratio": 0.0, "completion_length": 615.0669708251953, "epoch": 0.7022627137629751, "grad_norm": 0.7078641057014465, "kl": 0.4954833984375, "learning_rate": 4.927911979003214e-06, "loss": 0.073, "reward": 2.0513393580913544, "reward_std": 0.18339637108147144, "rewards/accuracy_reward": 0.07589286309666932, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9888393133878708, "step": 2351 }, { "clip_ratio": 0.0, "completion_length": 542.4843978881836, "epoch": 0.7025614218504966, "grad_norm": 0.9050997495651245, "kl": 0.5404052734375, "learning_rate": 4.918925704277336e-06, "loss": 0.1351, "reward": 2.172991156578064, "reward_std": 0.2567193787544966, "rewards/accuracy_reward": 0.2120535857975483, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9832589775323868, "step": 2352 }, { "clip_ratio": 0.0, "completion_length": 568.053596496582, "epoch": 0.702860129938018, "grad_norm": 1.1401921510696411, "kl": 0.4315185546875, "learning_rate": 4.909944957262298e-06, "loss": 0.039, "reward": 2.103236675262451, "reward_std": 0.1529970020055771, "rewards/accuracy_reward": 0.11830357275903225, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 2353 }, { "clip_ratio": 0.0, "completion_length": 603.310302734375, "epoch": 0.7031588380255396, "grad_norm": 0.2845262587070465, "kl": 0.5943603515625, "learning_rate": 4.900969747728263e-06, "loss": 0.0299, "reward": 2.126674175262451, "reward_std": 0.16567337885499, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455633878708, "step": 2354 }, { "clip_ratio": 0.0, "completion_length": 561.3794860839844, "epoch": 0.703457546113061, "grad_norm": 5.638838291168213, "kl": 1.429443359375, "learning_rate": 4.892000085439383e-06, "loss": 0.1377, "reward": 2.130580484867096, "reward_std": 0.18115367740392685, "rewards/accuracy_reward": 0.1517857164144516, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9899553954601288, "step": 2355 }, { "clip_ratio": 0.0, "completion_length": 593.8214569091797, "epoch": 0.7037562542005825, "grad_norm": 1.8401347398757935, "kl": 0.40673828125, "learning_rate": 4.8830359801537765e-06, "loss": 0.0803, "reward": 2.0111608505249023, "reward_std": 0.17113377526402473, "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.988839328289032, "step": 2356 }, { "clip_ratio": 0.0, "completion_length": 594.4955596923828, "epoch": 0.7040549622881039, "grad_norm": 3.152203321456909, "kl": 1.2398681640625, "learning_rate": 4.874077441623504e-06, "loss": 0.1371, "reward": 2.146205425262451, "reward_std": 0.22970738634467125, "rewards/accuracy_reward": 0.1808035804424435, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9854910969734192, "step": 2357 }, { "clip_ratio": 0.0, "completion_length": 587.4977951049805, "epoch": 0.7043536703756255, "grad_norm": 0.5356863737106323, "kl": 0.5478515625, "learning_rate": 4.86512447959458e-06, "loss": 0.0164, "reward": 2.1266742050647736, "reward_std": 0.13597064465284348, "rewards/accuracy_reward": 0.149553582072258, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 2358 }, { "clip_ratio": 0.0, "completion_length": 610.5357513427734, "epoch": 0.7046523784631469, "grad_norm": 27.81760597229004, "kl": 0.58935546875, "learning_rate": 4.856177103806954e-06, "loss": 0.0789, "reward": 2.024553656578064, "reward_std": 0.19860705360770226, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.988839328289032, "step": 2359 }, { "clip_ratio": 0.0, "completion_length": 553.0937805175781, "epoch": 0.7049510865506684, "grad_norm": 1.7485538721084595, "kl": 0.41455078125, "learning_rate": 4.847235323994487e-06, "loss": 0.0458, "reward": 2.125558078289032, "reward_std": 0.1375644225627184, "rewards/accuracy_reward": 0.1473214402794838, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9916294813156128, "step": 2360 }, { "clip_ratio": 0.0, "completion_length": 647.419677734375, "epoch": 0.7052497946381898, "grad_norm": 0.9177297353744507, "kl": 0.43896484375, "learning_rate": 4.8382991498849615e-06, "loss": 0.0214, "reward": 2.016741156578064, "reward_std": 0.08287574630230665, "rewards/accuracy_reward": 0.03125000232830644, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.994419664144516, "step": 2361 }, { "clip_ratio": 0.0, "completion_length": 595.8460083007812, "epoch": 0.7055485027257113, "grad_norm": 0.4369255602359772, "kl": 0.28955078125, "learning_rate": 4.829368591200064e-06, "loss": 0.0418, "reward": 2.087611734867096, "reward_std": 0.1693578790873289, "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.991629496216774, "step": 2362 }, { "clip_ratio": 0.0, "completion_length": 607.4486846923828, "epoch": 0.7058472108132328, "grad_norm": 0.8669313192367554, "kl": 0.2010498046875, "learning_rate": 4.82044365765536e-06, "loss": 0.0458, "reward": 2.0970982909202576, "reward_std": 0.23877723142504692, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9899553805589676, "step": 2363 }, { "clip_ratio": 0.0, "completion_length": 626.3884124755859, "epoch": 0.7061459189007543, "grad_norm": 0.7257909178733826, "kl": 0.5087890625, "learning_rate": 4.811524358960304e-06, "loss": 0.0496, "reward": 2.024553656578064, "reward_std": 0.09191895090043545, "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714626312256, "step": 2364 }, { "clip_ratio": 0.0, "completion_length": 613.0044708251953, "epoch": 0.7064446269882757, "grad_norm": 0.33879631757736206, "kl": 0.22216796875, "learning_rate": 4.802610704818226e-06, "loss": 0.0367, "reward": 2.1032367944717407, "reward_std": 0.15794652327895164, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9960937649011612, "step": 2365 }, { "clip_ratio": 0.0, "completion_length": 614.3527069091797, "epoch": 0.7067433350757972, "grad_norm": 1.4257786273956299, "kl": 0.5323486328125, "learning_rate": 4.793702704926297e-06, "loss": 0.0449, "reward": 2.1350446939468384, "reward_std": 0.16284780949354172, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9854911267757416, "step": 2366 }, { "clip_ratio": 0.0, "completion_length": 638.107177734375, "epoch": 0.7070420431633186, "grad_norm": 0.27922844886779785, "kl": 0.2149658203125, "learning_rate": 4.784800368975557e-06, "loss": 0.0466, "reward": 2.133928656578064, "reward_std": 0.13335565850138664, "rewards/accuracy_reward": 0.15625001210719347, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9910714626312256, "step": 2367 }, { "clip_ratio": 0.0, "completion_length": 613.1361846923828, "epoch": 0.7073407512508402, "grad_norm": 1.4915717840194702, "kl": 0.63427734375, "learning_rate": 4.775903706650866e-06, "loss": 0.0988, "reward": 2.1004464626312256, "reward_std": 0.18980243429541588, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9843750447034836, "step": 2368 }, { "clip_ratio": 0.0, "completion_length": 546.372802734375, "epoch": 0.7076394593383616, "grad_norm": 1.3151863813400269, "kl": 0.39306640625, "learning_rate": 4.767012727630927e-06, "loss": 0.0461, "reward": 2.1043527126312256, "reward_std": 0.13212991319596767, "rewards/accuracy_reward": 0.12053571571595967, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9927455484867096, "step": 2369 }, { "clip_ratio": 0.0, "completion_length": 621.2210083007812, "epoch": 0.707938167425883, "grad_norm": 0.7084788680076599, "kl": 0.6041259765625, "learning_rate": 4.758127441588257e-06, "loss": 0.0906, "reward": 2.0078125596046448, "reward_std": 0.15696480683982372, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.987723246216774, "step": 2370 }, { "clip_ratio": 0.0, "completion_length": 586.2232360839844, "epoch": 0.7082368755134045, "grad_norm": 2.78342866897583, "kl": 1.34228515625, "learning_rate": 4.749247858189167e-06, "loss": 0.1173, "reward": 2.103236675262451, "reward_std": 0.3061543218791485, "rewards/accuracy_reward": 0.15178572130389512, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9804687947034836, "step": 2371 }, { "clip_ratio": 0.0, "completion_length": 586.7098388671875, "epoch": 0.7085355836009259, "grad_norm": 1.4381861686706543, "kl": 0.400390625, "learning_rate": 4.7403739870937786e-06, "loss": 0.0401, "reward": 2.1015626192092896, "reward_std": 0.10550143755972385, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 2372 }, { "clip_ratio": 0.0, "completion_length": 639.4286041259766, "epoch": 0.7088342916884475, "grad_norm": 1.1376210451126099, "kl": 0.5731201171875, "learning_rate": 4.731505837955997e-06, "loss": 0.0721, "reward": 2.1210938096046448, "reward_std": 0.1362636499106884, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616454601288, "step": 2373 }, { "clip_ratio": 0.0, "completion_length": 626.5781707763672, "epoch": 0.7091329997759689, "grad_norm": 0.695741593837738, "kl": 0.3223876953125, "learning_rate": 4.722643420423493e-06, "loss": 0.0478, "reward": 2.111049234867096, "reward_std": 0.1572271715849638, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9972098469734192, "step": 2374 }, { "clip_ratio": 0.0, "completion_length": 562.0201110839844, "epoch": 0.7094317078634904, "grad_norm": 1.350864291191101, "kl": 0.950439453125, "learning_rate": 4.71378674413771e-06, "loss": 0.1009, "reward": 2.1037946939468384, "reward_std": 0.19028658792376518, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9854911118745804, "step": 2375 }, { "clip_ratio": 0.0, "completion_length": 582.7522430419922, "epoch": 0.7097304159510118, "grad_norm": 0.5564544796943665, "kl": 0.3367919921875, "learning_rate": 4.704935818733848e-06, "loss": 0.0453, "reward": 2.079799175262451, "reward_std": 0.14540928974747658, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9949777126312256, "step": 2376 }, { "clip_ratio": 0.0, "completion_length": 621.4174346923828, "epoch": 0.7100291240385334, "grad_norm": 0.49200963973999023, "kl": 0.254638671875, "learning_rate": 4.69609065384084e-06, "loss": 0.0454, "reward": 2.113839328289032, "reward_std": 0.1752476654946804, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933036118745804, "step": 2377 }, { "clip_ratio": 0.0, "completion_length": 653.3616485595703, "epoch": 0.7103278321260548, "grad_norm": 0.2174617499113083, "kl": 0.1102294921875, "learning_rate": 4.687251259081362e-06, "loss": 0.0286, "reward": 2.108259081840515, "reward_std": 0.1529456228017807, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.994419664144516, "step": 2378 }, { "clip_ratio": 0.0, "completion_length": 673.3236999511719, "epoch": 0.7106265402135763, "grad_norm": 1.1989710330963135, "kl": 0.5535888671875, "learning_rate": 4.678417644071813e-06, "loss": 0.0469, "reward": 2.122767925262451, "reward_std": 0.09255589731037617, "rewards/accuracy_reward": 0.14062500488944352, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714626312256, "step": 2379 }, { "clip_ratio": 0.0, "completion_length": 580.091552734375, "epoch": 0.7109252483010977, "grad_norm": 0.38678017258644104, "kl": 0.1795654296875, "learning_rate": 4.669589818422291e-06, "loss": 0.0276, "reward": 2.0982143878936768, "reward_std": 0.10112605523318052, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.9955357313156128, "step": 2380 }, { "clip_ratio": 0.0, "completion_length": 612.2433319091797, "epoch": 0.7112239563886192, "grad_norm": 1.2320162057876587, "kl": 0.47802734375, "learning_rate": 4.6607677917366155e-06, "loss": 0.0771, "reward": 2.0312501192092896, "reward_std": 0.15296676196157932, "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714626312256, "step": 2381 }, { "clip_ratio": 0.0, "completion_length": 591.9085083007812, "epoch": 0.7115226644761407, "grad_norm": 2.376413106918335, "kl": 0.91064453125, "learning_rate": 4.651951573612277e-06, "loss": 0.0954, "reward": 2.1099331378936768, "reward_std": 0.1653868369758129, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973469734192, "step": 2382 }, { "clip_ratio": 0.0, "completion_length": 626.4643096923828, "epoch": 0.7118213725636622, "grad_norm": 0.14848726987838745, "kl": 0.257080078125, "learning_rate": 4.643141173640461e-06, "loss": 0.0042, "reward": 2.069196581840515, "reward_std": 0.1766119971871376, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 2383 }, { "clip_ratio": 0.0, "completion_length": 627.9509124755859, "epoch": 0.7121200806511836, "grad_norm": 0.2381768673658371, "kl": 0.2379150390625, "learning_rate": 4.6343366014060235e-06, "loss": 0.0192, "reward": 2.098772406578064, "reward_std": 0.16737618669867516, "rewards/accuracy_reward": 0.11607143119908869, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9938616454601288, "step": 2384 }, { "clip_ratio": 0.0, "completion_length": 607.6830444335938, "epoch": 0.7124187887387051, "grad_norm": 0.22766515612602234, "kl": 0.340087890625, "learning_rate": 4.625537866487468e-06, "loss": 0.0472, "reward": 2.087053656578064, "reward_std": 0.1674576885998249, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9888393133878708, "step": 2385 }, { "clip_ratio": 0.0, "completion_length": 606.0714569091797, "epoch": 0.7127174968262265, "grad_norm": 0.28819549083709717, "kl": 0.150146484375, "learning_rate": 4.61674497845696e-06, "loss": 0.0175, "reward": 2.0848214626312256, "reward_std": 0.18267815932631493, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9910714626312256, "step": 2386 }, { "clip_ratio": 0.0, "completion_length": 645.8192291259766, "epoch": 0.7130162049137481, "grad_norm": 3.1055490970611572, "kl": 0.98046875, "learning_rate": 4.607957946880305e-06, "loss": 0.0903, "reward": 2.0664062798023224, "reward_std": 0.24077194556593895, "rewards/accuracy_reward": 0.09598214598372579, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9882812947034836, "step": 2387 }, { "clip_ratio": 0.0, "completion_length": 579.9330596923828, "epoch": 0.7133149130012695, "grad_norm": 2.837232828140259, "kl": 0.5672607421875, "learning_rate": 4.599176781316922e-06, "loss": 0.0187, "reward": 2.1590403020381927, "reward_std": 0.12974920216947794, "rewards/accuracy_reward": 0.1830357287544757, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973469734192, "step": 2388 }, { "clip_ratio": 0.0, "completion_length": 579.8794860839844, "epoch": 0.713613621088791, "grad_norm": 0.639521598815918, "kl": 0.544189453125, "learning_rate": 4.590401491319864e-06, "loss": 0.0503, "reward": 2.148995578289032, "reward_std": 0.15456673875451088, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9860491305589676, "step": 2389 }, { "clip_ratio": 0.0, "completion_length": 597.0312652587891, "epoch": 0.7139123291763124, "grad_norm": 0.944195568561554, "kl": 0.41650390625, "learning_rate": 4.5816320864357875e-06, "loss": 0.0383, "reward": 2.0848215222358704, "reward_std": 0.17113907262682915, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9910714626312256, "step": 2390 }, { "clip_ratio": 0.0, "completion_length": 579.1629791259766, "epoch": 0.714211037263834, "grad_norm": 1.1749980449676514, "kl": 0.1353759765625, "learning_rate": 4.5728685762049415e-06, "loss": 0.0104, "reward": 2.1015626192092896, "reward_std": 0.07947594951838255, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.996651828289032, "step": 2391 }, { "clip_ratio": 0.0, "completion_length": 574.0937805175781, "epoch": 0.7145097453513554, "grad_norm": 1.995025873184204, "kl": 0.9493408203125, "learning_rate": 4.564110970161168e-06, "loss": 0.0494, "reward": 2.091517895460129, "reward_std": 0.14269958809018135, "rewards/accuracy_reward": 0.12276786053553224, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.986607164144516, "step": 2392 }, { "clip_ratio": 0.0, "completion_length": 587.1651916503906, "epoch": 0.7148084534388769, "grad_norm": 313.81976318359375, "kl": 0.9453125, "learning_rate": 4.55535927783189e-06, "loss": 0.1084, "reward": 2.0652902722358704, "reward_std": 0.23652231693267822, "rewards/accuracy_reward": 0.10267857508733869, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9871652126312256, "step": 2393 }, { "clip_ratio": 0.0, "completion_length": 610.3303680419922, "epoch": 0.7151071615263983, "grad_norm": 0.49282050132751465, "kl": 0.2181396484375, "learning_rate": 4.54661350873808e-06, "loss": 0.0239, "reward": 2.014508992433548, "reward_std": 0.22770777717232704, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9877232611179352, "step": 2394 }, { "clip_ratio": 0.0, "completion_length": 613.6652069091797, "epoch": 0.7154058696139198, "grad_norm": 0.7841877937316895, "kl": 0.48095703125, "learning_rate": 4.537873672394288e-06, "loss": 0.0379, "reward": 2.0145090222358704, "reward_std": 0.17565019987523556, "rewards/accuracy_reward": 0.05803571571595967, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9854911118745804, "step": 2395 }, { "clip_ratio": 0.0, "completion_length": 547.4107360839844, "epoch": 0.7157045777014412, "grad_norm": 1.1445996761322021, "kl": 0.6795654296875, "learning_rate": 4.52913977830859e-06, "loss": 0.0621, "reward": 2.02678582072258, "reward_std": 0.22546963952481747, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9866071790456772, "step": 2396 }, { "clip_ratio": 0.0, "completion_length": 499.8214416503906, "epoch": 0.7160032857889628, "grad_norm": 0.1771859973669052, "kl": 0.1485595703125, "learning_rate": 4.520411835982612e-06, "loss": 0.0129, "reward": 2.1607143878936768, "reward_std": 0.2364093717187643, "rewards/accuracy_reward": 0.1941964402794838, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9933036118745804, "step": 2397 }, { "clip_ratio": 0.0, "completion_length": 550.3616256713867, "epoch": 0.7163019938764842, "grad_norm": 0.3169930875301361, "kl": 0.5267333984375, "learning_rate": 4.5116898549115025e-06, "loss": 0.0295, "reward": 2.145647346973419, "reward_std": 0.29765037074685097, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9871652275323868, "step": 2398 }, { "clip_ratio": 0.0, "completion_length": 553.0000305175781, "epoch": 0.7166007019640057, "grad_norm": 0.8900113701820374, "kl": 0.76220703125, "learning_rate": 4.502973844583914e-06, "loss": 0.044, "reward": 2.0279018878936768, "reward_std": 0.22260859608650208, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9854911118745804, "step": 2399 }, { "clip_ratio": 0.0, "completion_length": 569.1920013427734, "epoch": 0.7168994100515271, "grad_norm": 0.5711090564727783, "kl": 0.3040771484375, "learning_rate": 4.494263814482018e-06, "loss": 0.0259, "reward": 2.125000149011612, "reward_std": 0.12339533492922783, "rewards/accuracy_reward": 0.1517857164144516, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9933035969734192, "step": 2400 }, { "clip_ratio": 0.0, "completion_length": 587.4710083007812, "epoch": 0.7171981181390487, "grad_norm": 0.8939747214317322, "kl": 0.20263671875, "learning_rate": 4.485559774081475e-06, "loss": 0.0353, "reward": 2.103794753551483, "reward_std": 0.1968657150864601, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9921875298023224, "step": 2401 }, { "clip_ratio": 0.0, "completion_length": 535.7567291259766, "epoch": 0.7174968262265701, "grad_norm": 0.26724886894226074, "kl": 0.25634765625, "learning_rate": 4.47686173285142e-06, "loss": 0.044, "reward": 2.029017984867096, "reward_std": 0.1681651622056961, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.988839328289032, "step": 2402 }, { "clip_ratio": 0.0, "completion_length": 560.2634124755859, "epoch": 0.7177955343140916, "grad_norm": 3.3992738723754883, "kl": 0.6103515625, "learning_rate": 4.4681697002544746e-06, "loss": 0.0777, "reward": 2.0976563096046448, "reward_std": 0.20722659304738045, "rewards/accuracy_reward": 0.14285715040750802, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9838170111179352, "step": 2403 }, { "clip_ratio": 0.0, "completion_length": 571.2567138671875, "epoch": 0.718094242401613, "grad_norm": 0.4077945649623871, "kl": 0.163330078125, "learning_rate": 4.459483685746721e-06, "loss": 0.0222, "reward": 2.0228795409202576, "reward_std": 0.2098926529288292, "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.991629496216774, "step": 2404 }, { "clip_ratio": 0.0, "completion_length": 571.1428833007812, "epoch": 0.7183929504891345, "grad_norm": 0.1445639282464981, "kl": 0.2110595703125, "learning_rate": 4.450803698777684e-06, "loss": -0.005, "reward": 2.0809152722358704, "reward_std": 0.17558342963457108, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9938616454601288, "step": 2405 }, { "clip_ratio": 0.0, "completion_length": 559.2969055175781, "epoch": 0.718691658576656, "grad_norm": 2.860140323638916, "kl": 0.9024658203125, "learning_rate": 4.442129748790344e-06, "loss": 0.082, "reward": 2.142857253551483, "reward_std": 0.166894119232893, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9866071939468384, "step": 2406 }, { "clip_ratio": 0.0, "completion_length": 567.5401916503906, "epoch": 0.7189903666641775, "grad_norm": 0.25881335139274597, "kl": 0.2291259765625, "learning_rate": 4.4334618452211065e-06, "loss": 0.0198, "reward": 2.072544753551483, "reward_std": 0.12141739577054977, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.994419664144516, "step": 2407 }, { "clip_ratio": 0.0, "completion_length": 524.8571624755859, "epoch": 0.7192890747516989, "grad_norm": 2.473314046859741, "kl": 0.52587890625, "learning_rate": 4.424799997499803e-06, "loss": 0.0463, "reward": 2.061384081840515, "reward_std": 0.18309857416898012, "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9921875298023224, "step": 2408 }, { "clip_ratio": 0.0, "completion_length": 584.013427734375, "epoch": 0.7195877828392204, "grad_norm": 0.26978442072868347, "kl": 0.2030029296875, "learning_rate": 4.416144215049677e-06, "loss": 0.0042, "reward": 2.047991156578064, "reward_std": 0.18547271564602852, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9921875596046448, "step": 2409 }, { "clip_ratio": 0.0, "completion_length": 567.8928833007812, "epoch": 0.7198864909267418, "grad_norm": 0.44692182540893555, "kl": 0.2940673828125, "learning_rate": 4.4074945072873655e-06, "loss": 0.0476, "reward": 2.0926340222358704, "reward_std": 0.1973830685019493, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9899553954601288, "step": 2410 }, { "clip_ratio": 0.0, "completion_length": 580.0357360839844, "epoch": 0.7201851990142634, "grad_norm": 0.182308629155159, "kl": 0.2762451171875, "learning_rate": 4.398850883622905e-06, "loss": 0.0042, "reward": 2.0474331080913544, "reward_std": 0.1459271553903818, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973618745804, "step": 2411 }, { "clip_ratio": 0.0, "completion_length": 557.9843826293945, "epoch": 0.7204839071017848, "grad_norm": 0.8698432445526123, "kl": 0.3839111328125, "learning_rate": 4.390213353459715e-06, "loss": 0.0464, "reward": 2.135044753551483, "reward_std": 0.1926470585167408, "rewards/accuracy_reward": 0.16071429406292737, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9899553954601288, "step": 2412 }, { "clip_ratio": 0.0, "completion_length": 552.8750381469727, "epoch": 0.7207826151893062, "grad_norm": 0.7300761938095093, "kl": 0.1630859375, "learning_rate": 4.381581926194575e-06, "loss": 0.0236, "reward": 2.0206474661827087, "reward_std": 0.16750281117856503, "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9960937947034836, "step": 2413 }, { "clip_ratio": 0.0, "completion_length": 540.0424346923828, "epoch": 0.7210813232768277, "grad_norm": 0.3650171458721161, "kl": 0.211669921875, "learning_rate": 4.372956611217638e-06, "loss": 0.0121, "reward": 2.0970982909202576, "reward_std": 0.21932610869407654, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9921875447034836, "step": 2414 }, { "clip_ratio": 0.0, "completion_length": 545.4776992797852, "epoch": 0.7213800313643491, "grad_norm": 0.2925066351890564, "kl": 0.5345458984375, "learning_rate": 4.3643374179123955e-06, "loss": -0.0006, "reward": 2.1573662161827087, "reward_std": 0.24531916249543428, "rewards/accuracy_reward": 0.18973215040750802, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9877232313156128, "step": 2415 }, { "clip_ratio": 0.0, "completion_length": 576.7745819091797, "epoch": 0.7216787394518707, "grad_norm": 5.445150852203369, "kl": 1.0140380859375, "learning_rate": 4.355724355655688e-06, "loss": 0.107, "reward": 2.1328125596046448, "reward_std": 0.17446560133248568, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9899553954601288, "step": 2416 }, { "clip_ratio": 0.0, "completion_length": 545.537971496582, "epoch": 0.7219774475393921, "grad_norm": 0.7906652092933655, "kl": 0.20068359375, "learning_rate": 4.347117433817687e-06, "loss": 0.0613, "reward": 2.0686384737491608, "reward_std": 0.13612964935600758, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.992745578289032, "step": 2417 }, { "clip_ratio": 0.0, "completion_length": 557.2857360839844, "epoch": 0.7222761556269136, "grad_norm": 1.4595272541046143, "kl": 0.3970947265625, "learning_rate": 4.3385166617618725e-06, "loss": 0.0355, "reward": 2.0797992050647736, "reward_std": 0.18589920178055763, "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9882812798023224, "step": 2418 }, { "clip_ratio": 0.0, "completion_length": 534.0022583007812, "epoch": 0.722574863714435, "grad_norm": 0.21490144729614258, "kl": 0.130859375, "learning_rate": 4.329922048845044e-06, "loss": 0.0345, "reward": 2.1372768878936768, "reward_std": 0.14224867708981037, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9944196790456772, "step": 2419 }, { "clip_ratio": 0.0, "completion_length": 551.303596496582, "epoch": 0.7228735718019565, "grad_norm": 0.2657545506954193, "kl": 0.2049560546875, "learning_rate": 4.3213336044173034e-06, "loss": 0.0132, "reward": 2.0937500596046448, "reward_std": 0.15989766269922256, "rewards/accuracy_reward": 0.10937500232830644, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.995535746216774, "step": 2420 }, { "clip_ratio": 0.0, "completion_length": 542.3884124755859, "epoch": 0.723172279889478, "grad_norm": 0.6458661556243896, "kl": 0.6224365234375, "learning_rate": 4.312751337822027e-06, "loss": 0.0537, "reward": 2.1088170409202576, "reward_std": 0.19656511209905148, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.98604916036129, "step": 2421 }, { "clip_ratio": 0.0, "completion_length": 597.5692291259766, "epoch": 0.7234709879769995, "grad_norm": 0.4882086217403412, "kl": 0.315673828125, "learning_rate": 4.304175258395887e-06, "loss": 0.0399, "reward": 2.0407367050647736, "reward_std": 0.16232961509376764, "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9916294813156128, "step": 2422 }, { "clip_ratio": 0.0, "completion_length": 573.5759201049805, "epoch": 0.7237696960645209, "grad_norm": 2.150583267211914, "kl": 0.3370361328125, "learning_rate": 4.295605375468818e-06, "loss": 0.0444, "reward": 2.0580357909202576, "reward_std": 0.14530619978904724, "rewards/accuracy_reward": 0.07366071594879031, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 2423 }, { "clip_ratio": 0.0, "completion_length": 574.0201263427734, "epoch": 0.7240684041520424, "grad_norm": 0.6797676682472229, "kl": 0.2882080078125, "learning_rate": 4.287041698364005e-06, "loss": 0.0349, "reward": 2.088169753551483, "reward_std": 0.16890599578619003, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9944196939468384, "step": 2424 }, { "clip_ratio": 0.0, "completion_length": 569.9911041259766, "epoch": 0.7243671122395638, "grad_norm": 0.1125260591506958, "kl": 0.08642578125, "learning_rate": 4.278484236397895e-06, "loss": -0.0038, "reward": 2.0959821939468384, "reward_std": 0.10909374058246613, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678805589676, "step": 2425 }, { "clip_ratio": 0.0, "completion_length": 589.4174346923828, "epoch": 0.7246658203270854, "grad_norm": 0.4701201021671295, "kl": 0.365234375, "learning_rate": 4.269932998880171e-06, "loss": 0.0052, "reward": 2.138951003551483, "reward_std": 0.15044416673481464, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9938616305589676, "step": 2426 }, { "clip_ratio": 0.0, "completion_length": 573.1919860839844, "epoch": 0.7249645284146068, "grad_norm": 0.9766054153442383, "kl": 0.587158203125, "learning_rate": 4.261387995113733e-06, "loss": 0.0565, "reward": 2.091517984867096, "reward_std": 0.1941515076905489, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9888393133878708, "step": 2427 }, { "clip_ratio": 0.0, "completion_length": 589.6026916503906, "epoch": 0.7252632365021283, "grad_norm": 0.40962469577789307, "kl": 0.4183349609375, "learning_rate": 4.2528492343947155e-06, "loss": 0.0051, "reward": 2.133928656578064, "reward_std": 0.1551322154700756, "rewards/accuracy_reward": 0.14285714668221772, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9955357313156128, "step": 2428 }, { "clip_ratio": 0.0, "completion_length": 599.7634124755859, "epoch": 0.7255619445896497, "grad_norm": 0.19387993216514587, "kl": 0.2591552734375, "learning_rate": 4.244316726012446e-06, "loss": 0.0259, "reward": 2.077008992433548, "reward_std": 0.1035456731915474, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875447034836, "step": 2429 }, { "clip_ratio": 0.0, "completion_length": 603.997802734375, "epoch": 0.7258606526771713, "grad_norm": 1.5003154277801514, "kl": 0.3548583984375, "learning_rate": 4.2357904792494606e-06, "loss": 0.0172, "reward": 2.0770090222358704, "reward_std": 0.21318018063902855, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9877232760190964, "step": 2430 }, { "clip_ratio": 0.0, "completion_length": 581.1384201049805, "epoch": 0.7261593607646927, "grad_norm": 0.4143844544887543, "kl": 0.2464599609375, "learning_rate": 4.2272705033814854e-06, "loss": 0.0197, "reward": 2.0011162161827087, "reward_std": 0.14090784545987844, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875298023224, "step": 2431 }, { "clip_ratio": 0.0, "completion_length": 584.1741333007812, "epoch": 0.7264580688522142, "grad_norm": 0.19169747829437256, "kl": 0.14599609375, "learning_rate": 4.21875680767741e-06, "loss": 0.0326, "reward": 2.1439733505249023, "reward_std": 0.1686420189216733, "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9921875298023224, "step": 2432 }, { "clip_ratio": 0.0, "completion_length": 580.6652069091797, "epoch": 0.7267567769397356, "grad_norm": 0.6656681895256042, "kl": 0.4844970703125, "learning_rate": 4.210249401399305e-06, "loss": 0.0578, "reward": 2.0909598767757416, "reward_std": 0.14522267505526543, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.992745578289032, "step": 2433 }, { "clip_ratio": 0.0, "completion_length": 593.1183319091797, "epoch": 0.7270554850272571, "grad_norm": 2.108872413635254, "kl": 0.631591796875, "learning_rate": 4.201748293802398e-06, "loss": 0.0858, "reward": 2.048549175262451, "reward_std": 0.19233328104019165, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9882812947034836, "step": 2434 }, { "clip_ratio": 0.0, "completion_length": 559.2366180419922, "epoch": 0.7273541931147786, "grad_norm": 0.2192699909210205, "kl": 0.09326171875, "learning_rate": 4.1932534941350545e-06, "loss": 0.008, "reward": 2.1093751192092896, "reward_std": 0.13625115901231766, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 2435 }, { "clip_ratio": 0.0, "completion_length": 627.2812652587891, "epoch": 0.7276529012023001, "grad_norm": 0.23117105662822723, "kl": 0.6531982421875, "learning_rate": 4.184765011638787e-06, "loss": 0.0217, "reward": 2.055245667695999, "reward_std": 0.17939877696335316, "rewards/accuracy_reward": 0.08928572060540318, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9882812947034836, "step": 2436 }, { "clip_ratio": 0.0, "completion_length": 632.9018249511719, "epoch": 0.7279516092898215, "grad_norm": 0.1409948766231537, "kl": 0.124755859375, "learning_rate": 4.176282855548236e-06, "loss": 0.0189, "reward": 2.0513393878936768, "reward_std": 0.10852781310677528, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9977678954601288, "step": 2437 }, { "clip_ratio": 0.0, "completion_length": 600.5893096923828, "epoch": 0.728250317377343, "grad_norm": 0.7913737893104553, "kl": 0.518310546875, "learning_rate": 4.1678070350911496e-06, "loss": 0.0314, "reward": 2.090959906578064, "reward_std": 0.16003508493304253, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 2438 }, { "clip_ratio": 0.0, "completion_length": 565.6272583007812, "epoch": 0.7285490254648644, "grad_norm": 0.16232487559318542, "kl": 0.21875, "learning_rate": 4.1593375594883955e-06, "loss": 0.008, "reward": 2.1361607909202576, "reward_std": 0.15037376806139946, "rewards/accuracy_reward": 0.15625000977888703, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9933036267757416, "step": 2439 }, { "clip_ratio": 0.0, "completion_length": 583.7388610839844, "epoch": 0.728847733552386, "grad_norm": 0.24966640770435333, "kl": 0.213134765625, "learning_rate": 4.150874437953927e-06, "loss": 0.0186, "reward": 2.0731027126312256, "reward_std": 0.13639197871088982, "rewards/accuracy_reward": 0.09151786426082253, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9949776977300644, "step": 2440 }, { "clip_ratio": 0.0, "completion_length": 601.607177734375, "epoch": 0.7291464416399074, "grad_norm": 0.3262089192867279, "kl": 0.1932373046875, "learning_rate": 4.142417679694794e-06, "loss": 0.0184, "reward": 2.090959906578064, "reward_std": 0.10415759216994047, "rewards/accuracy_reward": 0.10267857811413705, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9949777126312256, "step": 2441 }, { "clip_ratio": 0.0, "completion_length": 612.2544860839844, "epoch": 0.7294451497274289, "grad_norm": 0.4342747628688812, "kl": 0.65771484375, "learning_rate": 4.133967293911124e-06, "loss": 0.0496, "reward": 2.130022406578064, "reward_std": 0.17680637165904045, "rewards/accuracy_reward": 0.15401786309666932, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9893973618745804, "step": 2442 }, { "clip_ratio": 0.0, "completion_length": 629.2701263427734, "epoch": 0.7297438578149503, "grad_norm": 0.3205488622188568, "kl": 0.313232421875, "learning_rate": 4.1255232897961015e-06, "loss": 0.039, "reward": 2.0820313096046448, "reward_std": 0.17602363973855972, "rewards/accuracy_reward": 0.11383929336443543, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9882812947034836, "step": 2443 }, { "clip_ratio": 0.0, "completion_length": 606.3683319091797, "epoch": 0.7300425659024719, "grad_norm": 1.5425671339035034, "kl": 0.47607421875, "learning_rate": 4.117085676535979e-06, "loss": 0.0625, "reward": 2.0864956974983215, "reward_std": 0.239535141736269, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9860491454601288, "step": 2444 }, { "clip_ratio": 0.0, "completion_length": 587.2232360839844, "epoch": 0.7303412739899933, "grad_norm": 0.13874362409114838, "kl": 0.107421875, "learning_rate": 4.108654463310058e-06, "loss": 0.0104, "reward": 2.095424234867096, "reward_std": 0.1304291905835271, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.997209832072258, "step": 2445 }, { "clip_ratio": 0.0, "completion_length": 570.5357437133789, "epoch": 0.7306399820775148, "grad_norm": 0.8229165077209473, "kl": 0.1644287109375, "learning_rate": 4.100229659290662e-06, "loss": 0.0225, "reward": 2.0691965222358704, "reward_std": 0.07469875365495682, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9977678805589676, "step": 2446 }, { "clip_ratio": 0.0, "completion_length": 623.0826110839844, "epoch": 0.7309386901650362, "grad_norm": 0.10733766853809357, "kl": 0.096923828125, "learning_rate": 4.091811273643157e-06, "loss": 0.0067, "reward": 2.0507814288139343, "reward_std": 0.11090241465717554, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9949776977300644, "step": 2447 }, { "clip_ratio": 0.0, "completion_length": 568.4821701049805, "epoch": 0.7312373982525577, "grad_norm": 0.4143773019313812, "kl": 0.3448486328125, "learning_rate": 4.083399315525925e-06, "loss": 0.0313, "reward": 2.0334822237491608, "reward_std": 0.14925704151391983, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9910714626312256, "step": 2448 }, { "clip_ratio": 0.0, "completion_length": 565.8660888671875, "epoch": 0.7315361063400792, "grad_norm": 0.13170655071735382, "kl": 0.0941162109375, "learning_rate": 4.074993794090346e-06, "loss": 0.0113, "reward": 2.114397406578064, "reward_std": 0.13726905174553394, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9960937947034836, "step": 2449 }, { "clip_ratio": 0.0, "completion_length": 593.8482360839844, "epoch": 0.7318348144276007, "grad_norm": 0.15034204721450806, "kl": 0.0736083984375, "learning_rate": 4.066594718480805e-06, "loss": -0.0008, "reward": 2.157366156578064, "reward_std": 0.20855946466326714, "rewards/accuracy_reward": 0.16964286798611283, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9966518133878708, "step": 2450 }, { "clip_ratio": 0.0, "completion_length": 616.3080596923828, "epoch": 0.7321335225151221, "grad_norm": 0.8218938708305359, "kl": 0.474365234375, "learning_rate": 4.058202097834679e-06, "loss": 0.0506, "reward": 2.0864956378936768, "reward_std": 0.22942280769348145, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9860491454601288, "step": 2451 }, { "clip_ratio": 0.0, "completion_length": 613.2522583007812, "epoch": 0.7324322306026436, "grad_norm": 0.5754503011703491, "kl": 0.5794677734375, "learning_rate": 4.049815941282307e-06, "loss": 0.0364, "reward": 2.095424234867096, "reward_std": 0.1602232987061143, "rewards/accuracy_reward": 0.12276786123402417, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9905134290456772, "step": 2452 }, { "clip_ratio": 0.0, "completion_length": 605.2835083007812, "epoch": 0.732730938690165, "grad_norm": 0.7627363801002502, "kl": 0.56298828125, "learning_rate": 4.041436257947015e-06, "loss": 0.0395, "reward": 2.10491082072258, "reward_std": 0.19155289605259895, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9888393133878708, "step": 2453 }, { "clip_ratio": 0.0, "completion_length": 589.2411041259766, "epoch": 0.7330296467776866, "grad_norm": 0.19927911460399628, "kl": 0.236328125, "learning_rate": 4.033063056945067e-06, "loss": 0.0203, "reward": 2.0697545409202576, "reward_std": 0.15391680039465427, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.99386166036129, "step": 2454 }, { "clip_ratio": 0.0, "completion_length": 634.9732513427734, "epoch": 0.733328354865208, "grad_norm": 0.7820965647697449, "kl": 0.25048828125, "learning_rate": 4.0246963473856915e-06, "loss": 0.0253, "reward": 2.0468751192092896, "reward_std": 0.12015431560575962, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.995535746216774, "step": 2455 }, { "clip_ratio": 0.0, "completion_length": 613.1138763427734, "epoch": 0.7336270629527294, "grad_norm": 0.21791788935661316, "kl": 0.196044921875, "learning_rate": 4.01633613837105e-06, "loss": 0.0204, "reward": 2.0781250596046448, "reward_std": 0.15711440332233906, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933035969734192, "step": 2456 }, { "clip_ratio": 0.0, "completion_length": 565.9352874755859, "epoch": 0.7339257710402509, "grad_norm": 1.1264855861663818, "kl": 0.5904541015625, "learning_rate": 4.0079824389962255e-06, "loss": 0.0719, "reward": 2.1077009737491608, "reward_std": 0.17205377481877804, "rewards/accuracy_reward": 0.14062500488944352, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9871651977300644, "step": 2457 }, { "clip_ratio": 0.0, "completion_length": 581.8058319091797, "epoch": 0.7342244791277723, "grad_norm": 4.323127746582031, "kl": 0.695556640625, "learning_rate": 3.999635258349226e-06, "loss": 0.0986, "reward": 2.157924234867096, "reward_std": 0.16602792032063007, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.990513414144516, "step": 2458 }, { "clip_ratio": 0.0, "completion_length": 602.6607360839844, "epoch": 0.7345231872152939, "grad_norm": 0.28735291957855225, "kl": 0.359375, "learning_rate": 3.991294605510969e-06, "loss": 0.0409, "reward": 2.0853796005249023, "reward_std": 0.16467585042119026, "rewards/accuracy_reward": 0.10714286402799189, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.991629496216774, "step": 2459 }, { "clip_ratio": 0.0, "completion_length": 560.8192367553711, "epoch": 0.7348218953028153, "grad_norm": 0.1808469146490097, "kl": 0.1881103515625, "learning_rate": 3.982960489555263e-06, "loss": 0.0221, "reward": 2.143415331840515, "reward_std": 0.1369278598576784, "rewards/accuracy_reward": 0.15625000931322575, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9960937649011612, "step": 2460 }, { "clip_ratio": 0.0, "completion_length": 593.1875305175781, "epoch": 0.7351206033903368, "grad_norm": 0.38271331787109375, "kl": 0.2071533203125, "learning_rate": 3.97463291954881e-06, "loss": 0.0264, "reward": 2.224888563156128, "reward_std": 0.16575832106173038, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9927455484867096, "step": 2461 }, { "clip_ratio": 0.0, "completion_length": 636.2745666503906, "epoch": 0.7354193114778582, "grad_norm": 0.2193976640701294, "kl": 0.1484375, "learning_rate": 3.966311904551195e-06, "loss": 0.0082, "reward": 2.0820313692092896, "reward_std": 0.18144380673766136, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949777275323868, "step": 2462 }, { "clip_ratio": 0.0, "completion_length": 566.3705673217773, "epoch": 0.7357180195653797, "grad_norm": 1.326737642288208, "kl": 0.65966796875, "learning_rate": 3.957997453614859e-06, "loss": 0.0566, "reward": 2.237723231315613, "reward_std": 0.1789430696517229, "rewards/accuracy_reward": 0.2633928619325161, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.9899553805589676, "step": 2463 }, { "clip_ratio": 0.0, "completion_length": 612.2857360839844, "epoch": 0.7360167276529012, "grad_norm": 0.5478139519691467, "kl": 0.3548583984375, "learning_rate": 3.949689575785114e-06, "loss": 0.0503, "reward": 2.0139509737491608, "reward_std": 0.12923802435398102, "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9893973618745804, "step": 2464 }, { "clip_ratio": 0.0, "completion_length": 623.0357360839844, "epoch": 0.7363154357404227, "grad_norm": 0.4249440133571625, "kl": 0.223876953125, "learning_rate": 3.94138828010012e-06, "loss": 0.0243, "reward": 2.0569197237491608, "reward_std": 0.13806810230016708, "rewards/accuracy_reward": 0.07366072130389512, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9944196790456772, "step": 2465 }, { "clip_ratio": 0.0, "completion_length": 621.2076263427734, "epoch": 0.7366141438279441, "grad_norm": 5.892307281494141, "kl": 0.784912109375, "learning_rate": 3.933093575590866e-06, "loss": 0.0534, "reward": 2.0742188692092896, "reward_std": 0.18302912078797817, "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.99386166036129, "step": 2466 }, { "clip_ratio": 0.0, "completion_length": 630.3393096923828, "epoch": 0.7369128519154656, "grad_norm": 1.2016288042068481, "kl": 0.6064453125, "learning_rate": 3.924805471281184e-06, "loss": 0.0854, "reward": 2.1551339626312256, "reward_std": 0.19353235326707363, "rewards/accuracy_reward": 0.2008928656578064, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9810268431901932, "step": 2467 }, { "clip_ratio": 0.0, "completion_length": 597.4129638671875, "epoch": 0.737211560002987, "grad_norm": 0.39844971895217896, "kl": 0.476318359375, "learning_rate": 3.916523976187713e-06, "loss": 0.0224, "reward": 2.0424107909202576, "reward_std": 0.16983656398952007, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9933036267757416, "step": 2468 }, { "clip_ratio": 0.0, "completion_length": 619.5803833007812, "epoch": 0.7375102680905086, "grad_norm": 1.137334942817688, "kl": 0.61962890625, "learning_rate": 3.90824909931991e-06, "loss": 0.0778, "reward": 2.058035761117935, "reward_std": 0.20753038488328457, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9866071939468384, "step": 2469 }, { "clip_ratio": 0.0, "completion_length": 574.5290222167969, "epoch": 0.73780897617803, "grad_norm": 1.484424352645874, "kl": 0.438720703125, "learning_rate": 3.899980849680036e-06, "loss": 0.0731, "reward": 2.122209906578064, "reward_std": 0.19801921769976616, "rewards/accuracy_reward": 0.15178572130389512, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9905134290456772, "step": 2470 }, { "clip_ratio": 0.0, "completion_length": 577.5893096923828, "epoch": 0.7381076842655515, "grad_norm": 0.8226521015167236, "kl": 0.223876953125, "learning_rate": 3.8917192362631285e-06, "loss": 0.0423, "reward": 2.013392984867096, "reward_std": 0.12860987521708012, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9910714775323868, "step": 2471 }, { "clip_ratio": 0.0, "completion_length": 597.9397583007812, "epoch": 0.7384063923530729, "grad_norm": 1.7606308460235596, "kl": 0.7476806640625, "learning_rate": 3.883464268057015e-06, "loss": 0.06, "reward": 2.185826003551483, "reward_std": 0.1674415674060583, "rewards/accuracy_reward": 0.21428572572767735, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973767757416, "step": 2472 }, { "clip_ratio": 0.0, "completion_length": 618.5089416503906, "epoch": 0.7387051004405945, "grad_norm": 0.16565962135791779, "kl": 0.199462890625, "learning_rate": 3.875215954042297e-06, "loss": 0.0163, "reward": 2.1183037161827087, "reward_std": 0.09031233284622431, "rewards/accuracy_reward": 0.12276786682195961, "rewards/format_reward": 0.9977678656578064, "rewards/tag_count_reward": 0.9977678656578064, "step": 2473 }, { "clip_ratio": 0.0, "completion_length": 624.1942138671875, "epoch": 0.7390038085281159, "grad_norm": 0.18325570225715637, "kl": 0.40185546875, "learning_rate": 3.866974303192323e-06, "loss": 0.0239, "reward": 2.0837054550647736, "reward_std": 0.18267012014985085, "rewards/accuracy_reward": 0.12500000977888703, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9854911118745804, "step": 2474 }, { "clip_ratio": 0.0, "completion_length": 610.6116333007812, "epoch": 0.7393025166156374, "grad_norm": 0.10717210173606873, "kl": 0.10302734375, "learning_rate": 3.858739324473208e-06, "loss": 0.004, "reward": 2.037388503551483, "reward_std": 0.06872299592942, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.997209832072258, "step": 2475 }, { "clip_ratio": 0.0, "completion_length": 637.3638763427734, "epoch": 0.7396012247031588, "grad_norm": 0.2568654716014862, "kl": 0.2581787109375, "learning_rate": 3.850511026843802e-06, "loss": 0.0334, "reward": 2.0312500596046448, "reward_std": 0.2321218503639102, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9843750596046448, "step": 2476 }, { "clip_ratio": 0.0, "completion_length": 576.6919860839844, "epoch": 0.7398999327906803, "grad_norm": 8.109870910644531, "kl": 0.9525146484375, "learning_rate": 3.842289419255681e-06, "loss": 0.0718, "reward": 2.1478795409202576, "reward_std": 0.23629219457507133, "rewards/accuracy_reward": 0.16741072246804833, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9938616454601288, "step": 2477 }, { "clip_ratio": 0.0, "completion_length": 607.6741485595703, "epoch": 0.7401986408782018, "grad_norm": 0.2079111635684967, "kl": 0.10107421875, "learning_rate": 3.834074510653151e-06, "loss": 0.016, "reward": 2.0128349363803864, "reward_std": 0.14876938611268997, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9927455633878708, "step": 2478 }, { "clip_ratio": 0.0, "completion_length": 611.7366333007812, "epoch": 0.7404973489657233, "grad_norm": 0.22201749682426453, "kl": 0.2069091796875, "learning_rate": 3.8258663099732304e-06, "loss": 0.0277, "reward": 2.0820313692092896, "reward_std": 0.17167818173766136, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.994977742433548, "step": 2479 }, { "clip_ratio": 0.0, "completion_length": 608.3973388671875, "epoch": 0.7407960570532447, "grad_norm": 0.2969864308834076, "kl": 0.1341552734375, "learning_rate": 3.817664826145633e-06, "loss": 0.0234, "reward": 2.0809152126312256, "reward_std": 0.1536339931190014, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9938616454601288, "step": 2480 }, { "clip_ratio": 0.0, "completion_length": 605.0558319091797, "epoch": 0.7410947651407662, "grad_norm": 1.0471707582473755, "kl": 0.385498046875, "learning_rate": 3.809470068092772e-06, "loss": 0.0307, "reward": 2.1333706974983215, "reward_std": 0.12817614432424307, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9949776977300644, "step": 2481 }, { "clip_ratio": 0.0, "completion_length": 577.7343902587891, "epoch": 0.7413934732282876, "grad_norm": 0.4812787175178528, "kl": 0.4029541015625, "learning_rate": 3.8012820447297384e-06, "loss": 0.0663, "reward": 2.138392925262451, "reward_std": 0.20407811366021633, "rewards/accuracy_reward": 0.17187500838190317, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9843750596046448, "step": 2482 }, { "clip_ratio": 0.0, "completion_length": 619.2477874755859, "epoch": 0.7416921813158092, "grad_norm": 0.8278794288635254, "kl": 0.1912841796875, "learning_rate": 3.793100764964299e-06, "loss": 0.0249, "reward": 2.090959906578064, "reward_std": 0.1843038909137249, "rewards/accuracy_reward": 0.11160715040750802, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.992745578289032, "step": 2483 }, { "clip_ratio": 0.0, "completion_length": 558.5982360839844, "epoch": 0.7419908894033306, "grad_norm": 0.5153234004974365, "kl": 0.7967529296875, "learning_rate": 3.7849262376968897e-06, "loss": 0.0725, "reward": 2.0970982909202576, "reward_std": 0.2220485508441925, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9854911267757416, "step": 2484 }, { "clip_ratio": 0.0, "completion_length": 628.6205444335938, "epoch": 0.7422895974908521, "grad_norm": 0.7201616168022156, "kl": 0.391845703125, "learning_rate": 3.7767584718205875e-06, "loss": 0.0413, "reward": 2.041294753551483, "reward_std": 0.17558139376342297, "rewards/accuracy_reward": 0.06696428684517741, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553954601288, "step": 2485 }, { "clip_ratio": 0.0, "completion_length": 625.2567138671875, "epoch": 0.7425883055783735, "grad_norm": 0.4808993935585022, "kl": 0.2900390625, "learning_rate": 3.768597476221125e-06, "loss": 0.0359, "reward": 2.047991156578064, "reward_std": 0.15712846256792545, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.98995541036129, "step": 2486 }, { "clip_ratio": 0.0, "completion_length": 630.3594055175781, "epoch": 0.742887013665895, "grad_norm": 0.2781783938407898, "kl": 0.1798095703125, "learning_rate": 3.7604432597768693e-06, "loss": 0.0087, "reward": 2.0362724363803864, "reward_std": 0.1049107201397419, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9938616305589676, "step": 2487 }, { "clip_ratio": 0.0, "completion_length": 631.9040374755859, "epoch": 0.7431857217534165, "grad_norm": 1.454400658607483, "kl": 0.49951171875, "learning_rate": 3.7522958313587996e-06, "loss": 0.0747, "reward": 2.0468751192092896, "reward_std": 0.19377663545310497, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9843750298023224, "step": 2488 }, { "clip_ratio": 0.0, "completion_length": 616.1607360839844, "epoch": 0.743484429840938, "grad_norm": 0.829561710357666, "kl": 0.1904296875, "learning_rate": 3.744155199830526e-06, "loss": 0.0353, "reward": 2.1283482909202576, "reward_std": 0.13912202417850494, "rewards/accuracy_reward": 0.14955357578583062, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875298023224, "step": 2489 }, { "clip_ratio": 0.0, "completion_length": 616.9821624755859, "epoch": 0.7437831379284594, "grad_norm": 0.6013960242271423, "kl": 0.5018310546875, "learning_rate": 3.73602137404826e-06, "loss": 0.0236, "reward": 2.036830484867096, "reward_std": 0.11204813048243523, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9899553954601288, "step": 2490 }, { "clip_ratio": 0.0, "completion_length": 634.9352874755859, "epoch": 0.7440818460159809, "grad_norm": 1.0080548524856567, "kl": 0.373046875, "learning_rate": 3.727894362860799e-06, "loss": 0.0621, "reward": 2.1049107909202576, "reward_std": 0.22280686348676682, "rewards/accuracy_reward": 0.14508928707800806, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9821428805589676, "step": 2491 }, { "clip_ratio": 0.0, "completion_length": 610.4085083007812, "epoch": 0.7443805541035023, "grad_norm": 7.113353729248047, "kl": 0.64404296875, "learning_rate": 3.7197741751095383e-06, "loss": 0.0577, "reward": 2.103236675262451, "reward_std": 0.18325339071452618, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9893973618745804, "step": 2492 }, { "clip_ratio": 0.0, "completion_length": 604.6786041259766, "epoch": 0.7446792621910239, "grad_norm": 0.47773122787475586, "kl": 0.574951171875, "learning_rate": 3.711660819628451e-06, "loss": 0.0579, "reward": 2.070312589406967, "reward_std": 0.19624873995780945, "rewards/accuracy_reward": 0.10937500232830644, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9832589626312256, "step": 2493 }, { "clip_ratio": 0.0, "completion_length": 565.3326034545898, "epoch": 0.7449779702785453, "grad_norm": 1.0002268552780151, "kl": 0.602294921875, "learning_rate": 3.7035543052440646e-06, "loss": 0.0884, "reward": 2.083705484867096, "reward_std": 0.26723165437579155, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9765625447034836, "step": 2494 }, { "clip_ratio": 0.0, "completion_length": 648.7299499511719, "epoch": 0.7452766783660668, "grad_norm": 0.6675733923912048, "kl": 0.4483642578125, "learning_rate": 3.69545464077548e-06, "loss": 0.0537, "reward": 2.1166295409202576, "reward_std": 0.23354875668883324, "rewards/accuracy_reward": 0.15401786798611283, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.984933078289032, "step": 2495 }, { "clip_ratio": 0.0, "completion_length": 578.3616333007812, "epoch": 0.7455753864535882, "grad_norm": 0.3745753765106201, "kl": 0.2449951171875, "learning_rate": 3.68736183503433e-06, "loss": 0.0322, "reward": 2.11272332072258, "reward_std": 0.18576020747423172, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9944196790456772, "step": 2496 }, { "clip_ratio": 0.0, "completion_length": 587.7812728881836, "epoch": 0.7458740945411098, "grad_norm": 4.503454685211182, "kl": 0.9453125, "learning_rate": 3.6792758968247986e-06, "loss": 0.0762, "reward": 2.087611645460129, "reward_std": 0.20794599503278732, "rewards/accuracy_reward": 0.12500000302679837, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.984933078289032, "step": 2497 }, { "clip_ratio": 0.0, "completion_length": 610.3817367553711, "epoch": 0.7461728026286312, "grad_norm": 2.564373731613159, "kl": 0.843017578125, "learning_rate": 3.6711968349435988e-06, "loss": 0.1205, "reward": 2.0691965222358704, "reward_std": 0.25451524183154106, "rewards/accuracy_reward": 0.12723215227015316, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9754464775323868, "step": 2498 }, { "clip_ratio": 0.0, "completion_length": 584.8259201049805, "epoch": 0.7464715107161526, "grad_norm": 67.23339080810547, "kl": 0.58642578125, "learning_rate": 3.6631246581799483e-06, "loss": 0.0946, "reward": 2.1333706378936768, "reward_std": 0.2774777188897133, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9860491454601288, "step": 2499 }, { "clip_ratio": 0.0, "completion_length": 611.4665374755859, "epoch": 0.7467702188036741, "grad_norm": 1.903896450996399, "kl": 0.4873046875, "learning_rate": 3.6550593753155893e-06, "loss": 0.0417, "reward": 2.094866156578064, "reward_std": 0.21770090609788895, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9921875149011612, "step": 2500 }, { "clip_ratio": 0.0, "completion_length": 612.372802734375, "epoch": 0.7470689268911955, "grad_norm": 0.26951658725738525, "kl": 0.341064453125, "learning_rate": 3.647000995124763e-06, "loss": 0.0671, "reward": 2.0703125596046448, "reward_std": 0.19950835406780243, "rewards/accuracy_reward": 0.09151785913854837, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9921875298023224, "step": 2501 }, { "clip_ratio": 0.0, "completion_length": 551.6607437133789, "epoch": 0.747367634978717, "grad_norm": 0.3638762831687927, "kl": 0.444091796875, "learning_rate": 3.6389495263741894e-06, "loss": 0.0285, "reward": 2.1333706378936768, "reward_std": 0.2109605148434639, "rewards/accuracy_reward": 0.16071429406292737, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812798023224, "step": 2502 }, { "clip_ratio": 0.0, "completion_length": 557.2232513427734, "epoch": 0.7476663430662385, "grad_norm": 1.5077687501907349, "kl": 0.2020263671875, "learning_rate": 3.6309049778230822e-06, "loss": 0.0396, "reward": 2.1517857909202576, "reward_std": 0.18052516225725412, "rewards/accuracy_reward": 0.16964286495931447, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9933035969734192, "step": 2503 }, { "clip_ratio": 0.0, "completion_length": 625.9219055175781, "epoch": 0.74796505115376, "grad_norm": 0.6759883761405945, "kl": 0.355712890625, "learning_rate": 3.622867358223122e-06, "loss": 0.046, "reward": 2.0401786863803864, "reward_std": 0.17839510180056095, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9933035969734192, "step": 2504 }, { "clip_ratio": 0.0, "completion_length": 593.1495666503906, "epoch": 0.7482637592412814, "grad_norm": 1.029949426651001, "kl": 0.259033203125, "learning_rate": 3.6148366763184485e-06, "loss": 0.0256, "reward": 2.0591518878936768, "reward_std": 0.17210186086595058, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9921875298023224, "step": 2505 }, { "clip_ratio": 0.0, "completion_length": 587.8750305175781, "epoch": 0.7485624673288029, "grad_norm": 0.6254590749740601, "kl": 0.570556640625, "learning_rate": 3.6068129408456564e-06, "loss": 0.0711, "reward": 2.067522406578064, "reward_std": 0.2403547577559948, "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9827009439468384, "step": 2506 }, { "clip_ratio": 0.0, "completion_length": 597.4531402587891, "epoch": 0.7488611754163244, "grad_norm": 1.3898049592971802, "kl": 0.5045166015625, "learning_rate": 3.5987961605337894e-06, "loss": 0.0466, "reward": 2.030133992433548, "reward_std": 0.07589286286383867, "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 2507 }, { "clip_ratio": 0.0, "completion_length": 520.7901840209961, "epoch": 0.7491598835038459, "grad_norm": 0.5612919330596924, "kl": 0.112548828125, "learning_rate": 3.5907863441043113e-06, "loss": 0.035, "reward": 2.107701003551483, "reward_std": 0.21315594390034676, "rewards/accuracy_reward": 0.12276786658912897, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616305589676, "step": 2508 }, { "clip_ratio": 0.0, "completion_length": 523.4754791259766, "epoch": 0.7494585915913673, "grad_norm": 0.41859543323516846, "kl": 0.1573486328125, "learning_rate": 3.582783500271122e-06, "loss": 0.0283, "reward": 2.0842634737491608, "reward_std": 0.17850715853273869, "rewards/accuracy_reward": 0.09821429033763707, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9949776977300644, "step": 2509 }, { "clip_ratio": 0.0, "completion_length": 577.7344207763672, "epoch": 0.7497572996788888, "grad_norm": 0.9675335884094238, "kl": 0.355224609375, "learning_rate": 3.574787637740528e-06, "loss": 0.0567, "reward": 2.0479911267757416, "reward_std": 0.2373841628432274, "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9854911118745804, "step": 2510 }, { "clip_ratio": 0.0, "completion_length": 533.3058319091797, "epoch": 0.7500560077664102, "grad_norm": 2.1717474460601807, "kl": 0.7899169921875, "learning_rate": 3.5667987652112445e-06, "loss": 0.1117, "reward": 2.1054688096046448, "reward_std": 0.24792726337909698, "rewards/accuracy_reward": 0.14062500977888703, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.984933078289032, "step": 2511 }, { "clip_ratio": 0.0, "completion_length": 520.0647506713867, "epoch": 0.7503547158539318, "grad_norm": 1.308581829071045, "kl": 0.55078125, "learning_rate": 3.558816891374387e-06, "loss": 0.0735, "reward": 2.1450893878936768, "reward_std": 0.1997009739279747, "rewards/accuracy_reward": 0.18303572246804833, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9866071790456772, "step": 2512 }, { "clip_ratio": 0.0, "completion_length": 517.7544937133789, "epoch": 0.7506534239414532, "grad_norm": 0.5221760869026184, "kl": 0.299560546875, "learning_rate": 3.5508420249134432e-06, "loss": 0.038, "reward": 2.119419753551483, "reward_std": 0.19666175171732903, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9921875298023224, "step": 2513 }, { "clip_ratio": 0.0, "completion_length": 543.0625152587891, "epoch": 0.7509521320289747, "grad_norm": 0.9543902277946472, "kl": 0.43505859375, "learning_rate": 3.5428741745042926e-06, "loss": 0.0995, "reward": 2.028459906578064, "reward_std": 0.28109483048319817, "rewards/accuracy_reward": 0.09151785867288709, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9793527126312256, "step": 2514 }, { "clip_ratio": 0.0, "completion_length": 563.1205444335938, "epoch": 0.7512508401164961, "grad_norm": 0.676356852054596, "kl": 0.53466796875, "learning_rate": 3.5349133488151764e-06, "loss": 0.0891, "reward": 2.0223215520381927, "reward_std": 0.3093463219702244, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9732143431901932, "step": 2515 }, { "clip_ratio": 0.0, "completion_length": 548.0000152587891, "epoch": 0.7515495482040176, "grad_norm": 1.0480977296829224, "kl": 0.74755859375, "learning_rate": 3.526959556506687e-06, "loss": 0.1135, "reward": 2.039062589406967, "reward_std": 0.3679114803671837, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.972098246216774, "step": 2516 }, { "clip_ratio": 0.0, "completion_length": 564.3236846923828, "epoch": 0.7518482562915391, "grad_norm": 0.8090245127677917, "kl": 1.35400390625, "learning_rate": 3.5190128062317742e-06, "loss": 0.1381, "reward": 1.9525670409202576, "reward_std": 0.3477543443441391, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.9263393431901932, "rewards/tag_count_reward": 0.9570312947034836, "step": 2517 }, { "clip_ratio": 0.0, "completion_length": 568.678596496582, "epoch": 0.7521469643790606, "grad_norm": 1.3116267919540405, "kl": 0.5908203125, "learning_rate": 3.5110731066357264e-06, "loss": 0.1455, "reward": 2.102120578289032, "reward_std": 0.40660126507282257, "rewards/accuracy_reward": 0.20758929662406445, "rewards/format_reward": 0.933035746216774, "rewards/tag_count_reward": 0.961495578289032, "step": 2518 }, { "clip_ratio": 0.0, "completion_length": 582.2210083007812, "epoch": 0.752445672466582, "grad_norm": 1.877231240272522, "kl": 1.1689453125, "learning_rate": 3.5031404663561507e-06, "loss": 0.1744, "reward": 1.915178656578064, "reward_std": 0.41643569618463516, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.8906250447034836, "rewards/tag_count_reward": 0.948660746216774, "step": 2519 }, { "clip_ratio": 0.0, "completion_length": 557.2098617553711, "epoch": 0.7527443805541035, "grad_norm": 3.097912549972534, "kl": 1.810546875, "learning_rate": 3.4952148940229915e-06, "loss": 0.2555, "reward": 1.9553572237491608, "reward_std": 0.3844388350844383, "rewards/accuracy_reward": 0.11160714365541935, "rewards/format_reward": 0.8995536118745804, "rewards/tag_count_reward": 0.9441964626312256, "step": 2520 }, { "clip_ratio": 0.0, "completion_length": 535.4531555175781, "epoch": 0.753043088641625, "grad_norm": 2.8023602962493896, "kl": 1.6328125, "learning_rate": 3.4872963982584873e-06, "loss": 0.2333, "reward": 1.92912957072258, "reward_std": 0.3946498706936836, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.8816964626312256, "rewards/tag_count_reward": 0.9402902126312256, "step": 2521 }, { "clip_ratio": 0.0, "completion_length": 579.1852874755859, "epoch": 0.7533417967291465, "grad_norm": 27.502107620239258, "kl": 2.19140625, "learning_rate": 3.4793849876771867e-06, "loss": 0.2558, "reward": 1.8420759737491608, "reward_std": 0.50318194180727, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.8392857313156128, "rewards/tag_count_reward": 0.9157366454601288, "step": 2522 }, { "clip_ratio": 0.0, "completion_length": 545.1875152587891, "epoch": 0.7536405048166679, "grad_norm": 86.36105346679688, "kl": 1.4990234375, "learning_rate": 3.471480670885935e-06, "loss": 0.2142, "reward": 1.9693081378936768, "reward_std": 0.4302360415458679, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9084821939468384, "rewards/tag_count_reward": 0.9402902275323868, "step": 2523 }, { "clip_ratio": 0.0, "completion_length": 551.0647506713867, "epoch": 0.7539392129041894, "grad_norm": 1.1167352199554443, "kl": 1.28271484375, "learning_rate": 3.4635834564838467e-06, "loss": 0.1751, "reward": 1.9386162161827087, "reward_std": 0.36007189378142357, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.9062500298023224, "rewards/tag_count_reward": 0.949776828289032, "step": 2524 }, { "clip_ratio": 0.0, "completion_length": 534.3214492797852, "epoch": 0.7542379209917108, "grad_norm": 2.4001386165618896, "kl": 1.087890625, "learning_rate": 3.4556933530623193e-06, "loss": 0.1673, "reward": 2.0446429550647736, "reward_std": 0.3354535922408104, "rewards/accuracy_reward": 0.15848215040750802, "rewards/format_reward": 0.926339328289032, "rewards/tag_count_reward": 0.9598214775323868, "step": 2525 }, { "clip_ratio": 0.0, "completion_length": 557.8884124755859, "epoch": 0.7545366290792324, "grad_norm": 68.73328399658203, "kl": 0.8271484375, "learning_rate": 3.4478103692050168e-06, "loss": 0.1308, "reward": 1.9966518878936768, "reward_std": 0.3594096079468727, "rewards/accuracy_reward": 0.10267857741564512, "rewards/format_reward": 0.9285714775323868, "rewards/tag_count_reward": 0.9654018431901932, "step": 2526 }, { "clip_ratio": 0.0, "completion_length": 476.0312728881836, "epoch": 0.7548353371667538, "grad_norm": 1.6004587411880493, "kl": 0.607421875, "learning_rate": 3.439934513487845e-06, "loss": 0.1168, "reward": 2.2371652722358704, "reward_std": 0.28115272894501686, "rewards/accuracy_reward": 0.290178582072258, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9804687947034836, "step": 2527 }, { "clip_ratio": 0.0, "completion_length": 487.28572845458984, "epoch": 0.7551340452542753, "grad_norm": 1.3672561645507812, "kl": 0.48193359375, "learning_rate": 3.432065794478967e-06, "loss": 0.109, "reward": 2.0898438096046448, "reward_std": 0.33100827783346176, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.9486607611179352, "rewards/tag_count_reward": 0.9715402126312256, "step": 2528 }, { "clip_ratio": 0.0, "completion_length": 549.2812805175781, "epoch": 0.7554327533417967, "grad_norm": 1.0064101219177246, "kl": 0.468017578125, "learning_rate": 3.4242042207387815e-06, "loss": 0.0873, "reward": 2.0106027722358704, "reward_std": 0.1946013830602169, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.983816996216774, "step": 2529 }, { "clip_ratio": 0.0, "completion_length": 545.4933166503906, "epoch": 0.7557314614293182, "grad_norm": 0.7095066905021667, "kl": 0.70068359375, "learning_rate": 3.4163498008199038e-06, "loss": 0.0629, "reward": 2.0591518878936768, "reward_std": 0.24813199788331985, "rewards/accuracy_reward": 0.10267858020961285, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9854911118745804, "step": 2530 }, { "clip_ratio": 0.0, "completion_length": 526.7254638671875, "epoch": 0.7560301695168397, "grad_norm": 1.4786458015441895, "kl": 0.66357421875, "learning_rate": 3.4085025432671746e-06, "loss": 0.1333, "reward": 2.079241156578064, "reward_std": 0.2659861668944359, "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9787946939468384, "step": 2531 }, { "clip_ratio": 0.0, "completion_length": 553.6540374755859, "epoch": 0.7563288776043612, "grad_norm": 2.0492920875549316, "kl": 0.565673828125, "learning_rate": 3.400662456617646e-06, "loss": 0.1157, "reward": 1.9648438692092896, "reward_std": 0.2626533471047878, "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.9531250298023224, "rewards/tag_count_reward": 0.9782366454601288, "step": 2532 }, { "clip_ratio": 0.0, "completion_length": 561.9732513427734, "epoch": 0.7566275856918826, "grad_norm": 1.6886322498321533, "kl": 1.0400390625, "learning_rate": 3.392829549400557e-06, "loss": 0.1425, "reward": 2.0825893878936768, "reward_std": 0.32958924770355225, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.973214328289032, "step": 2533 }, { "clip_ratio": 0.0, "completion_length": 578.1986846923828, "epoch": 0.7569262937794041, "grad_norm": 1.5802016258239746, "kl": 1.10009765625, "learning_rate": 3.385003830137349e-06, "loss": 0.1639, "reward": 1.9648438394069672, "reward_std": 0.4113229438662529, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.9196428954601288, "rewards/tag_count_reward": 0.9559152275323868, "step": 2534 }, { "clip_ratio": 0.0, "completion_length": 571.5536041259766, "epoch": 0.7572250018669255, "grad_norm": 6.88553524017334, "kl": 3.51953125, "learning_rate": 3.3771853073416306e-06, "loss": 0.3936, "reward": 1.859933078289032, "reward_std": 0.5617782175540924, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.8459821939468384, "rewards/tag_count_reward": 0.9068080633878708, "step": 2535 }, { "clip_ratio": 0.0, "completion_length": 577.1116180419922, "epoch": 0.7575237099544471, "grad_norm": 5.875586986541748, "kl": 2.26171875, "learning_rate": 3.3693739895191934e-06, "loss": 0.2566, "reward": 1.9436384737491608, "reward_std": 0.45376764237880707, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.8816964775323868, "rewards/tag_count_reward": 0.930245578289032, "step": 2536 }, { "clip_ratio": 0.0, "completion_length": 591.4531402587891, "epoch": 0.7578224180419685, "grad_norm": 8.71484661102295, "kl": 3.978515625, "learning_rate": 3.3615698851679866e-06, "loss": 0.4551, "reward": 1.7767857909202576, "reward_std": 0.5968044400215149, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.8281250447034836, "rewards/tag_count_reward": 0.8906250447034836, "step": 2537 }, { "clip_ratio": 0.0, "completion_length": 633.6763610839844, "epoch": 0.75812112612949, "grad_norm": 7.839754104614258, "kl": 3.875, "learning_rate": 3.3537730027781057e-06, "loss": 0.4293, "reward": 1.7790179550647736, "reward_std": 0.627232626080513, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.7700893133878708, "rewards/tag_count_reward": 0.863839328289032, "step": 2538 }, { "clip_ratio": 0.0, "completion_length": 627.6406402587891, "epoch": 0.7584198342170114, "grad_norm": 7.823958396911621, "kl": 4.515625, "learning_rate": 3.3459833508317984e-06, "loss": 0.4747, "reward": 1.7299107909202576, "reward_std": 0.6460355371236801, "rewards/accuracy_reward": 0.08035714854486287, "rewards/format_reward": 0.7745535969734192, "rewards/tag_count_reward": 0.8750000298023224, "step": 2539 }, { "clip_ratio": 0.0, "completion_length": 584.7433166503906, "epoch": 0.758718542304533, "grad_norm": 4.95302152633667, "kl": 3.62890625, "learning_rate": 3.338200937803444e-06, "loss": 0.4024, "reward": 1.868303656578064, "reward_std": 0.5863147675991058, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.8415178954601288, "rewards/tag_count_reward": 0.9084821790456772, "step": 2540 }, { "clip_ratio": 0.0, "completion_length": 592.3036041259766, "epoch": 0.7590172503920544, "grad_norm": 2.133282423019409, "kl": 2.345703125, "learning_rate": 3.33042577215954e-06, "loss": 0.2841, "reward": 1.8710938692092896, "reward_std": 0.4616982415318489, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.8683036118745804, "rewards/tag_count_reward": 0.9291295111179352, "step": 2541 }, { "clip_ratio": 0.0, "completion_length": 535.1741180419922, "epoch": 0.7593159584795758, "grad_norm": 3.2062249183654785, "kl": 1.873046875, "learning_rate": 3.322657862358707e-06, "loss": 0.2453, "reward": 2.0184152722358704, "reward_std": 0.3904435336589813, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9514509439468384, "step": 2542 }, { "clip_ratio": 0.0, "completion_length": 587.9732360839844, "epoch": 0.7596146665670973, "grad_norm": 130.06239318847656, "kl": 1.7314453125, "learning_rate": 3.3148972168516737e-06, "loss": 0.1793, "reward": 1.9190849363803864, "reward_std": 0.33977270871400833, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9592634290456772, "step": 2543 }, { "clip_ratio": 0.0, "completion_length": 558.7522583007812, "epoch": 0.7599133746546187, "grad_norm": 2.0277726650238037, "kl": 0.9111328125, "learning_rate": 3.307143844081253e-06, "loss": 0.115, "reward": 1.9129465222358704, "reward_std": 0.2733018472790718, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.933035746216774, "rewards/tag_count_reward": 0.957589328289032, "step": 2544 }, { "clip_ratio": 0.0, "completion_length": 539.4219055175781, "epoch": 0.7602120827421402, "grad_norm": 52.539093017578125, "kl": 1.45703125, "learning_rate": 3.2993977524823616e-06, "loss": 0.1793, "reward": 1.9609376192092896, "reward_std": 0.45918213576078415, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.8995536118745804, "rewards/tag_count_reward": 0.949776828289032, "step": 2545 }, { "clip_ratio": 0.0, "completion_length": 557.8058242797852, "epoch": 0.7605107908296617, "grad_norm": 45.13301086425781, "kl": 1.724609375, "learning_rate": 3.2916589504819886e-06, "loss": 0.1531, "reward": 2.017299234867096, "reward_std": 0.3407190665602684, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.9308036118745804, "rewards/tag_count_reward": 0.9681920260190964, "step": 2546 }, { "clip_ratio": 0.0, "completion_length": 550.4397506713867, "epoch": 0.7608094989171832, "grad_norm": 3.6464881896972656, "kl": 1.34716796875, "learning_rate": 3.2839274464991856e-06, "loss": 0.1572, "reward": 2.0223215222358704, "reward_std": 0.4107897914946079, "rewards/accuracy_reward": 0.16741072200238705, "rewards/format_reward": 0.9062500447034836, "rewards/tag_count_reward": 0.9486607611179352, "step": 2547 }, { "clip_ratio": 0.0, "completion_length": 542.4642944335938, "epoch": 0.7611082070047046, "grad_norm": 4.808828830718994, "kl": 1.0009765625, "learning_rate": 3.276203248945078e-06, "loss": 0.2087, "reward": 1.9972098767757416, "reward_std": 0.4086413308978081, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9525670111179352, "step": 2548 }, { "clip_ratio": 0.0, "completion_length": 518.357177734375, "epoch": 0.7614069150922261, "grad_norm": 2.3576252460479736, "kl": 1.0244140625, "learning_rate": 3.2684863662228307e-06, "loss": 0.1542, "reward": 1.9905134439468384, "reward_std": 0.36497482657432556, "rewards/accuracy_reward": 0.13169643841683865, "rewards/format_reward": 0.9017857611179352, "rewards/tag_count_reward": 0.9570313096046448, "step": 2549 }, { "clip_ratio": 0.0, "completion_length": 550.6183319091797, "epoch": 0.7617056231797475, "grad_norm": 3.744899034500122, "kl": 1.451171875, "learning_rate": 3.260776806727657e-06, "loss": 0.2387, "reward": 1.8504465222358704, "reward_std": 0.42430344969034195, "rewards/accuracy_reward": 0.024553572293370962, "rewards/format_reward": 0.8861607611179352, "rewards/tag_count_reward": 0.9397321790456772, "step": 2550 }, { "clip_ratio": 0.0, "completion_length": 582.154052734375, "epoch": 0.7620043312672691, "grad_norm": 2.917238235473633, "kl": 2.318359375, "learning_rate": 3.2530745788468052e-06, "loss": 0.2595, "reward": 1.8666295409202576, "reward_std": 0.5307346358895302, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.8437500447034836, "rewards/tag_count_reward": 0.9179687947034836, "step": 2551 }, { "clip_ratio": 0.0, "completion_length": 585.1250305175781, "epoch": 0.7623030393547905, "grad_norm": 4.492969036102295, "kl": 4.11328125, "learning_rate": 3.2453796909595394e-06, "loss": 0.4673, "reward": 1.7896206080913544, "reward_std": 0.6470943689346313, "rewards/accuracy_reward": 0.10044643236324191, "rewards/format_reward": 0.8080357313156128, "rewards/tag_count_reward": 0.8811384439468384, "step": 2552 }, { "clip_ratio": 0.0, "completion_length": 567.7812805175781, "epoch": 0.762601747442312, "grad_norm": 3.0230534076690674, "kl": 3.953125, "learning_rate": 3.237692151437146e-06, "loss": 0.432, "reward": 1.768415242433548, "reward_std": 0.6192170530557632, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.8035714775323868, "rewards/tag_count_reward": 0.8822545111179352, "step": 2553 }, { "clip_ratio": 0.0, "completion_length": 588.6295013427734, "epoch": 0.7629004555298334, "grad_norm": 3.0653069019317627, "kl": 3.89453125, "learning_rate": 3.2300119686429177e-06, "loss": 0.4704, "reward": 1.7232143580913544, "reward_std": 0.5899878740310669, "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.7901785969734192, "rewards/tag_count_reward": 0.8861607611179352, "step": 2554 }, { "clip_ratio": 0.0, "completion_length": 575.0491180419922, "epoch": 0.763199163617355, "grad_norm": 4.98789119720459, "kl": 3.8046875, "learning_rate": 3.2223391509321335e-06, "loss": 0.447, "reward": 1.8789063096046448, "reward_std": 0.5850331336259842, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.8348214626312256, "rewards/tag_count_reward": 0.8922991454601288, "step": 2555 }, { "clip_ratio": 0.0, "completion_length": 569.1093902587891, "epoch": 0.7634978717048764, "grad_norm": 7.8812255859375, "kl": 4.4921875, "learning_rate": 3.2146737066520705e-06, "loss": 0.465, "reward": 1.8856027722358704, "reward_std": 0.5720180347561836, "rewards/accuracy_reward": 0.13839286752045155, "rewards/format_reward": 0.8459821790456772, "rewards/tag_count_reward": 0.9012277275323868, "step": 2556 }, { "clip_ratio": 0.0, "completion_length": 569.0670013427734, "epoch": 0.7637965797923979, "grad_norm": 1.7828775644302368, "kl": 2.8515625, "learning_rate": 3.2070156441419864e-06, "loss": 0.3615, "reward": 1.8532367050647736, "reward_std": 0.5264393612742424, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.854910746216774, "rewards/tag_count_reward": 0.9135045111179352, "step": 2557 }, { "clip_ratio": 0.0, "completion_length": 567.9665374755859, "epoch": 0.7640952878799193, "grad_norm": 16.694072723388672, "kl": 3.029296875, "learning_rate": 3.199364971733092e-06, "loss": 0.3826, "reward": 1.8364956378936768, "reward_std": 0.606773130595684, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.8147321790456772, "rewards/tag_count_reward": 0.8878348618745804, "step": 2558 }, { "clip_ratio": 0.0, "completion_length": 609.8504638671875, "epoch": 0.7643939959674408, "grad_norm": 1.410156011581421, "kl": 2.58984375, "learning_rate": 3.1917216977485765e-06, "loss": 0.3317, "reward": 1.778459906578064, "reward_std": 0.5629691332578659, "rewards/accuracy_reward": 0.05357142956927419, "rewards/format_reward": 0.8281250447034836, "rewards/tag_count_reward": 0.8967634439468384, "step": 2559 }, { "clip_ratio": 0.0, "completion_length": 565.2924346923828, "epoch": 0.7646927040549623, "grad_norm": 14.868843078613281, "kl": 3.517578125, "learning_rate": 3.1840858305035727e-06, "loss": 0.4257, "reward": 1.9241072237491608, "reward_std": 0.5378433540463448, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.8593750596046448, "rewards/tag_count_reward": 0.9174107611179352, "step": 2560 }, { "clip_ratio": 0.0, "completion_length": 581.4620819091797, "epoch": 0.7649914121424838, "grad_norm": 2.274280309677124, "kl": 2.052734375, "learning_rate": 3.176457378305151e-06, "loss": 0.2847, "reward": 1.8588170409202576, "reward_std": 0.5106440037488937, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.8526786267757416, "rewards/tag_count_reward": 0.9146205931901932, "step": 2561 }, { "clip_ratio": 0.0, "completion_length": 560.575927734375, "epoch": 0.7652901202300052, "grad_norm": 28.59758186340332, "kl": 1.4921875, "learning_rate": 3.1688363494523267e-06, "loss": 0.2841, "reward": 1.9363840520381927, "reward_std": 0.4822167530655861, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.9017857611179352, "rewards/tag_count_reward": 0.9408482611179352, "step": 2562 }, { "clip_ratio": 0.0, "completion_length": 586.5156402587891, "epoch": 0.7655888283175267, "grad_norm": 2.2372639179229736, "kl": 1.8125, "learning_rate": 3.161222752236024e-06, "loss": 0.2593, "reward": 1.872767984867096, "reward_std": 0.49300502240657806, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.870535746216774, "rewards/tag_count_reward": 0.9218750447034836, "step": 2563 }, { "clip_ratio": 0.0, "completion_length": 588.3102874755859, "epoch": 0.7658875364050481, "grad_norm": 4.183320999145508, "kl": 2.1796875, "learning_rate": 3.1536165949390953e-06, "loss": 0.2808, "reward": 1.8750001192092896, "reward_std": 0.5109494104981422, "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.8772321790456772, "rewards/tag_count_reward": 0.9308036118745804, "step": 2564 }, { "clip_ratio": 0.0, "completion_length": 567.6562805175781, "epoch": 0.7661862444925697, "grad_norm": 172.87733459472656, "kl": 1.607421875, "learning_rate": 3.1460178858362955e-06, "loss": 0.2187, "reward": 1.9129465222358704, "reward_std": 0.48715466260910034, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.8660714775323868, "rewards/tag_count_reward": 0.917410746216774, "step": 2565 }, { "clip_ratio": 0.0, "completion_length": 527.2321548461914, "epoch": 0.7664849525800911, "grad_norm": 263.27935791015625, "kl": 3.27734375, "learning_rate": 3.1384266331942715e-06, "loss": 0.3645, "reward": 1.9988840222358704, "reward_std": 0.3891155831515789, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.917410746216774, "rewards/tag_count_reward": 0.9475446790456772, "step": 2566 }, { "clip_ratio": 0.0, "completion_length": 558.4888687133789, "epoch": 0.7667836606676126, "grad_norm": 29.271007537841797, "kl": 1.990234375, "learning_rate": 3.1308428452715643e-06, "loss": 0.3388, "reward": 1.8398438394069672, "reward_std": 0.5387094840407372, "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.8571428954601288, "rewards/tag_count_reward": 0.9090402126312256, "step": 2567 }, { "clip_ratio": 0.0, "completion_length": 560.3594055175781, "epoch": 0.767082368755134, "grad_norm": 1.6214121580123901, "kl": 1.9716796875, "learning_rate": 3.123266530318594e-06, "loss": 0.3319, "reward": 1.903459906578064, "reward_std": 0.5118336528539658, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.8705357760190964, "rewards/tag_count_reward": 0.9213170111179352, "step": 2568 }, { "clip_ratio": 0.0, "completion_length": 556.8303833007812, "epoch": 0.7673810768426556, "grad_norm": 4.0903215408325195, "kl": 1.935546875, "learning_rate": 3.115697696577644e-06, "loss": 0.3355, "reward": 1.9179688692092896, "reward_std": 0.5598429888486862, "rewards/accuracy_reward": 0.1584821566939354, "rewards/format_reward": 0.8549107611179352, "rewards/tag_count_reward": 0.9045759439468384, "step": 2569 }, { "clip_ratio": 0.0, "completion_length": 535.9107360839844, "epoch": 0.767679784930177, "grad_norm": 1.613326907157898, "kl": 1.19921875, "learning_rate": 3.1081363522828655e-06, "loss": 0.2538, "reward": 1.9514510035514832, "reward_std": 0.40593064948916435, "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.9129464775323868, "rewards/tag_count_reward": 0.944754496216774, "step": 2570 }, { "clip_ratio": 0.0, "completion_length": 557.9352951049805, "epoch": 0.7679784930176985, "grad_norm": 36.063602447509766, "kl": 1.5634765625, "learning_rate": 3.1005825056602634e-06, "loss": 0.2688, "reward": 1.89006707072258, "reward_std": 0.5611154288053513, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.8593750298023224, "rewards/tag_count_reward": 0.90792416036129, "step": 2571 }, { "clip_ratio": 0.0, "completion_length": 547.5736846923828, "epoch": 0.7682772011052199, "grad_norm": 12.69835376739502, "kl": 1.4248046875, "learning_rate": 3.0930361649276774e-06, "loss": 0.2545, "reward": 1.8655135035514832, "reward_std": 0.45553672313690186, "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.879464328289032, "rewards/tag_count_reward": 0.92354916036129, "step": 2572 }, { "clip_ratio": 0.0, "completion_length": 568.6428833007812, "epoch": 0.7685759091927414, "grad_norm": 5.46826696395874, "kl": 1.234375, "learning_rate": 3.0854973382947884e-06, "loss": 0.2605, "reward": 1.8716518878936768, "reward_std": 0.4967822805047035, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.8839286267757416, "rewards/tag_count_reward": 0.9229911118745804, "step": 2573 }, { "clip_ratio": 0.0, "completion_length": 544.3058319091797, "epoch": 0.7688746172802629, "grad_norm": 2.31199312210083, "kl": 1.5859375, "learning_rate": 3.0779660339631035e-06, "loss": 0.2471, "reward": 1.9202009737491608, "reward_std": 0.4224333018064499, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.9040178954601288, "rewards/tag_count_reward": 0.938058078289032, "step": 2574 }, { "clip_ratio": 0.0, "completion_length": 552.2232513427734, "epoch": 0.7691733253677844, "grad_norm": 10.10221004486084, "kl": 1.3076171875, "learning_rate": 3.070442260125939e-06, "loss": 0.2971, "reward": 1.9436384737491608, "reward_std": 0.4642871543765068, "rewards/accuracy_reward": 0.14508928963914514, "rewards/format_reward": 0.879464328289032, "rewards/tag_count_reward": 0.9190848618745804, "step": 2575 }, { "clip_ratio": 0.0, "completion_length": 576.0870819091797, "epoch": 0.7694720334553058, "grad_norm": 86.9609375, "kl": 2.310546875, "learning_rate": 3.0629260249684288e-06, "loss": 0.3699, "reward": 1.8253348767757416, "reward_std": 0.5949529930949211, "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.8437500447034836, "rewards/tag_count_reward": 0.8945312798023224, "step": 2576 }, { "clip_ratio": 0.0, "completion_length": 527.5268249511719, "epoch": 0.7697707415428273, "grad_norm": 0.7751642465591431, "kl": 0.912109375, "learning_rate": 3.0554173366674944e-06, "loss": 0.2668, "reward": 1.9123884737491608, "reward_std": 0.4869481474161148, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.9062500298023224, "rewards/tag_count_reward": 0.9347098618745804, "step": 2577 }, { "clip_ratio": 0.0, "completion_length": 564.3348388671875, "epoch": 0.7700694496303487, "grad_norm": 617.5186767578125, "kl": 2.65625, "learning_rate": 3.0479162033918553e-06, "loss": 0.3445, "reward": 1.8828126192092896, "reward_std": 0.4610721990466118, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.8727678954601288, "rewards/tag_count_reward": 0.9162946790456772, "step": 2578 }, { "clip_ratio": 0.0, "completion_length": 574.2076110839844, "epoch": 0.7703681577178703, "grad_norm": 1.7649353742599487, "kl": 1.2880859375, "learning_rate": 3.0404226333020117e-06, "loss": 0.3211, "reward": 1.83537957072258, "reward_std": 0.5584254041314125, "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.8660714775323868, "rewards/tag_count_reward": 0.9045759290456772, "step": 2579 }, { "clip_ratio": 0.0, "completion_length": 564.7567291259766, "epoch": 0.7706668658053917, "grad_norm": 4.142560005187988, "kl": 1.4560546875, "learning_rate": 3.0329366345502287e-06, "loss": 0.2624, "reward": 1.9218751192092896, "reward_std": 0.4685066118836403, "rewards/accuracy_reward": 0.09151786379516125, "rewards/format_reward": 0.9017857611179352, "rewards/tag_count_reward": 0.9285714775323868, "step": 2580 }, { "clip_ratio": 0.0, "completion_length": 548.3594055175781, "epoch": 0.7709655738929132, "grad_norm": 913.284912109375, "kl": 3.779296875, "learning_rate": 3.025458215280542e-06, "loss": 0.456, "reward": 1.9732144176959991, "reward_std": 0.4364239200949669, "rewards/accuracy_reward": 0.14508929452858865, "rewards/format_reward": 0.8973214775323868, "rewards/tag_count_reward": 0.9308036118745804, "step": 2581 }, { "clip_ratio": 0.0, "completion_length": 546.3013610839844, "epoch": 0.7712642819804346, "grad_norm": 10.159666061401367, "kl": 1.828125, "learning_rate": 3.017987383628741e-06, "loss": 0.2999, "reward": 1.9514510035514832, "reward_std": 0.47200383991003036, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.8928571790456772, "rewards/tag_count_reward": 0.9335937947034836, "step": 2582 }, { "clip_ratio": 0.0, "completion_length": 555.8638610839844, "epoch": 0.7715629900679561, "grad_norm": 6.577317714691162, "kl": 1.697265625, "learning_rate": 3.0105241477223533e-06, "loss": 0.3279, "reward": 1.8125000894069672, "reward_std": 0.5169334709644318, "rewards/accuracy_reward": 0.03125000139698386, "rewards/format_reward": 0.8683036267757416, "rewards/tag_count_reward": 0.9129464626312256, "step": 2583 }, { "clip_ratio": 0.0, "completion_length": 548.8437805175781, "epoch": 0.7718616981554776, "grad_norm": 1.47555673122406, "kl": 1.17578125, "learning_rate": 3.0030685156806506e-06, "loss": 0.2615, "reward": 1.8945313394069672, "reward_std": 0.47634056210517883, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.8950893133878708, "rewards/tag_count_reward": 0.930245578289032, "step": 2584 }, { "clip_ratio": 0.0, "completion_length": 575.8102874755859, "epoch": 0.772160406242999, "grad_norm": 7.0371994972229, "kl": 0.9267578125, "learning_rate": 2.995620495614633e-06, "loss": 0.234, "reward": 1.9397322535514832, "reward_std": 0.48741472512483597, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.8928571939468384, "rewards/tag_count_reward": 0.926339328289032, "step": 2585 }, { "clip_ratio": 0.0, "completion_length": 534.5580520629883, "epoch": 0.7724591143305205, "grad_norm": 1.3016339540481567, "kl": 1.068359375, "learning_rate": 2.98818009562701e-06, "loss": 0.2293, "reward": 1.9391741752624512, "reward_std": 0.39801541715860367, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.9218750447034836, "rewards/tag_count_reward": 0.9436384439468384, "step": 2586 }, { "clip_ratio": 0.0, "completion_length": 556.8125305175781, "epoch": 0.7727578224180419, "grad_norm": 1.444108247756958, "kl": 0.919921875, "learning_rate": 2.9807473238122097e-06, "loss": 0.2195, "reward": 1.9375000596046448, "reward_std": 0.4083630442619324, "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.9196428954601288, "rewards/tag_count_reward": 0.9441964626312256, "step": 2587 }, { "clip_ratio": 0.0, "completion_length": 540.053596496582, "epoch": 0.7730565305055634, "grad_norm": 1.9404668807983398, "kl": 1.3330078125, "learning_rate": 2.9733221882563647e-06, "loss": 0.2964, "reward": 1.9006697237491608, "reward_std": 0.46302976459264755, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.879464328289032, "rewards/tag_count_reward": 0.9207589626312256, "step": 2588 }, { "clip_ratio": 0.0, "completion_length": 495.7388610839844, "epoch": 0.7733552385930849, "grad_norm": 1.3290857076644897, "kl": 0.6826171875, "learning_rate": 2.9659046970372875e-06, "loss": 0.2066, "reward": 2.0295759737491608, "reward_std": 0.34249261021614075, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9397321790456772, "rewards/tag_count_reward": 0.9626116454601288, "step": 2589 }, { "clip_ratio": 0.0, "completion_length": 514.8125305175781, "epoch": 0.7736539466806064, "grad_norm": 36.46235275268555, "kl": 0.382568359375, "learning_rate": 2.9584948582244865e-06, "loss": 0.0882, "reward": 1.9743304550647736, "reward_std": 0.21429041773080826, "rewards/accuracy_reward": 0.03125000209547579, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9765625447034836, "step": 2590 }, { "clip_ratio": 0.0, "completion_length": 532.7968902587891, "epoch": 0.7739526547681278, "grad_norm": 0.6770066618919373, "kl": 0.521484375, "learning_rate": 2.951092679879136e-06, "loss": 0.1407, "reward": 2.0591518878936768, "reward_std": 0.2590383365750313, "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9698661118745804, "step": 2591 }, { "clip_ratio": 0.0, "completion_length": 502.57591247558594, "epoch": 0.7742513628556493, "grad_norm": 5.988120079040527, "kl": 0.647216796875, "learning_rate": 2.9436981700540824e-06, "loss": 0.1337, "reward": 2.0357143580913544, "reward_std": 0.3263329453766346, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9687500596046448, "step": 2592 }, { "clip_ratio": 0.0, "completion_length": 456.08483123779297, "epoch": 0.7745500709431707, "grad_norm": 2.579204559326172, "kl": 0.46435546875, "learning_rate": 2.936311336793831e-06, "loss": 0.1421, "reward": 2.0998884737491608, "reward_std": 0.24040727503597736, "rewards/accuracy_reward": 0.15401786658912897, "rewards/format_reward": 0.9687500596046448, "rewards/tag_count_reward": 0.977120578289032, "step": 2593 }, { "clip_ratio": 0.0, "completion_length": 517.8393173217773, "epoch": 0.7748487790306923, "grad_norm": 1.4281120300292969, "kl": 0.58837890625, "learning_rate": 2.9289321881345257e-06, "loss": 0.1594, "reward": 2.0786831378936768, "reward_std": 0.3782290294766426, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.9464286118745804, "rewards/tag_count_reward": 0.9670759290456772, "step": 2594 }, { "clip_ratio": 0.0, "completion_length": 505.5580520629883, "epoch": 0.7751474871182137, "grad_norm": 1.400578260421753, "kl": 0.761962890625, "learning_rate": 2.9215607321039606e-06, "loss": 0.2024, "reward": 1.9592634737491608, "reward_std": 0.3316238895058632, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.9486607611179352, "rewards/tag_count_reward": 0.9637277275323868, "step": 2595 }, { "clip_ratio": 0.0, "completion_length": 475.31029510498047, "epoch": 0.7754461952057352, "grad_norm": 0.4827735722064972, "kl": 0.228759765625, "learning_rate": 2.9141969767215607e-06, "loss": 0.074, "reward": 2.0312501192092896, "reward_std": 0.22068623453378677, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9799107611179352, "step": 2596 }, { "clip_ratio": 0.0, "completion_length": 476.8192138671875, "epoch": 0.7757449032932566, "grad_norm": 1.3826065063476562, "kl": 0.75244140625, "learning_rate": 2.9068409299983634e-06, "loss": 0.1836, "reward": 2.011718839406967, "reward_std": 0.2803462687879801, "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.9441964775323868, "rewards/tag_count_reward": 0.9626116454601288, "step": 2597 }, { "clip_ratio": 0.0, "completion_length": 539.6875228881836, "epoch": 0.7760436113807782, "grad_norm": 11.838194847106934, "kl": 0.3399658203125, "learning_rate": 2.8994925999370305e-06, "loss": 0.1046, "reward": 2.0725447237491608, "reward_std": 0.2585071660578251, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.9464285969734192, "rewards/tag_count_reward": 0.965401828289032, "step": 2598 }, { "clip_ratio": 0.0, "completion_length": 530.1852874755859, "epoch": 0.7763423194682996, "grad_norm": 0.404083251953125, "kl": 0.31494140625, "learning_rate": 2.8921519945318276e-06, "loss": 0.1336, "reward": 2.0005581378936768, "reward_std": 0.2646028883755207, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.97823666036129, "step": 2599 }, { "clip_ratio": 0.0, "completion_length": 488.8169860839844, "epoch": 0.7766410275558211, "grad_norm": 0.49425747990608215, "kl": 0.162841796875, "learning_rate": 2.884819121768607e-06, "loss": 0.0821, "reward": 2.039620667695999, "reward_std": 0.2149523552507162, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.983816996216774, "step": 2600 }, { "clip_ratio": 0.0, "completion_length": 458.63842010498047, "epoch": 0.7769397356433425, "grad_norm": 0.5725934505462646, "kl": 0.298095703125, "learning_rate": 2.877493989624822e-06, "loss": 0.096, "reward": 2.1255581080913544, "reward_std": 0.1759230885654688, "rewards/accuracy_reward": 0.16517857694998384, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.982700914144516, "step": 2601 }, { "clip_ratio": 0.0, "completion_length": 487.8638687133789, "epoch": 0.777238443730864, "grad_norm": 0.27086910605430603, "kl": 0.19970703125, "learning_rate": 2.8701766060694937e-06, "loss": 0.0627, "reward": 2.072544753551483, "reward_std": 0.18936522863805294, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9832589775323868, "step": 2602 }, { "clip_ratio": 0.0, "completion_length": 497.51341247558594, "epoch": 0.7775371518183855, "grad_norm": 0.40381282567977905, "kl": 0.246826171875, "learning_rate": 2.862866979063219e-06, "loss": 0.0773, "reward": 2.0362723767757416, "reward_std": 0.2031015194952488, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9871652126312256, "step": 2603 }, { "clip_ratio": 0.0, "completion_length": 523.1830596923828, "epoch": 0.777835859905907, "grad_norm": 0.4172159433364868, "kl": 0.271240234375, "learning_rate": 2.855565116558161e-06, "loss": 0.117, "reward": 2.0301340222358704, "reward_std": 0.25413206964731216, "rewards/accuracy_reward": 0.10044643213041127, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9720982611179352, "step": 2604 }, { "clip_ratio": 0.0, "completion_length": 480.60047149658203, "epoch": 0.7781345679934284, "grad_norm": 0.4421561062335968, "kl": 0.2003173828125, "learning_rate": 2.848271026498023e-06, "loss": 0.1104, "reward": 2.138951003551483, "reward_std": 0.2199342306703329, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9782366454601288, "step": 2605 }, { "clip_ratio": 0.0, "completion_length": 437.1183166503906, "epoch": 0.7784332760809499, "grad_norm": 0.6564431190490723, "kl": 0.24267578125, "learning_rate": 2.8409847168180628e-06, "loss": 0.1264, "reward": 2.124442011117935, "reward_std": 0.19928980618715286, "rewards/accuracy_reward": 0.16294643562287092, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.983816996216774, "step": 2606 }, { "clip_ratio": 0.0, "completion_length": 506.1384048461914, "epoch": 0.7787319841684713, "grad_norm": 0.39309948682785034, "kl": 0.2608642578125, "learning_rate": 2.833706195445075e-06, "loss": 0.0896, "reward": 2.066406339406967, "reward_std": 0.1863395031541586, "rewards/accuracy_reward": 0.10267857322469354, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.983816996216774, "step": 2607 }, { "clip_ratio": 0.0, "completion_length": 484.71207427978516, "epoch": 0.7790306922559929, "grad_norm": 0.28917232155799866, "kl": 0.364990234375, "learning_rate": 2.826435470297372e-06, "loss": 0.0905, "reward": 2.1054688692092896, "reward_std": 0.22253309935331345, "rewards/accuracy_reward": 0.1428571455180645, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9827009290456772, "step": 2608 }, { "clip_ratio": 0.0, "completion_length": 485.7589416503906, "epoch": 0.7793294003435143, "grad_norm": 0.609236478805542, "kl": 0.35009765625, "learning_rate": 2.8191725492847923e-06, "loss": 0.1285, "reward": 2.088169753551483, "reward_std": 0.2549532763659954, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9787946939468384, "step": 2609 }, { "clip_ratio": 0.0, "completion_length": 474.7120666503906, "epoch": 0.7796281084310358, "grad_norm": 0.654880702495575, "kl": 0.367919921875, "learning_rate": 2.8119174403086845e-06, "loss": 0.1045, "reward": 2.069754481315613, "reward_std": 0.19179902970790863, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9827009290456772, "step": 2610 }, { "clip_ratio": 0.0, "completion_length": 470.0201110839844, "epoch": 0.7799268165185572, "grad_norm": 0.5593488812446594, "kl": 0.208251953125, "learning_rate": 2.8046701512618914e-06, "loss": 0.0616, "reward": 2.110491156578064, "reward_std": 0.22961454093456268, "rewards/accuracy_reward": 0.13839286426082253, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.987723246216774, "step": 2611 }, { "clip_ratio": 0.0, "completion_length": 477.1897506713867, "epoch": 0.7802255246060787, "grad_norm": 0.5460464954376221, "kl": 0.28076171875, "learning_rate": 2.797430690028755e-06, "loss": 0.0857, "reward": 2.0825893878936768, "reward_std": 0.2227441305294633, "rewards/accuracy_reward": 0.12723215157166123, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.979910746216774, "step": 2612 }, { "clip_ratio": 0.0, "completion_length": 498.28797149658203, "epoch": 0.7805242326936002, "grad_norm": 0.15523970127105713, "kl": 0.10107421875, "learning_rate": 2.7901990644851042e-06, "loss": 0.0392, "reward": 2.0262277722358704, "reward_std": 0.11639170069247484, "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.990513414144516, "step": 2613 }, { "clip_ratio": 0.0, "completion_length": 438.83260345458984, "epoch": 0.7808229407811217, "grad_norm": 0.23975612223148346, "kl": 0.4068603515625, "learning_rate": 2.7829752824982305e-06, "loss": 0.0487, "reward": 2.115513503551483, "reward_std": 0.1319560231640935, "rewards/accuracy_reward": 0.13392857951112092, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455484867096, "step": 2614 }, { "clip_ratio": 0.0, "completion_length": 483.10269927978516, "epoch": 0.7811216488686431, "grad_norm": 0.23397468030452728, "kl": 0.1221923828125, "learning_rate": 2.7757593519269088e-06, "loss": 0.0326, "reward": 2.135044753551483, "reward_std": 0.11644208803772926, "rewards/accuracy_reward": 0.15178571874275804, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 2615 }, { "clip_ratio": 0.0, "completion_length": 492.17189025878906, "epoch": 0.7814203569561646, "grad_norm": 0.6476691365242004, "kl": 0.2523193359375, "learning_rate": 2.768551280621358e-06, "loss": 0.058, "reward": 2.0558036267757416, "reward_std": 0.1453307792544365, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9843750149011612, "step": 2616 }, { "clip_ratio": 0.0, "completion_length": 519.6451187133789, "epoch": 0.781719065043686, "grad_norm": 0.20256371796131134, "kl": 0.0960693359375, "learning_rate": 2.7613510764232542e-06, "loss": 0.0104, "reward": 2.0703126788139343, "reward_std": 0.16572260670363903, "rewards/accuracy_reward": 0.08258929057046771, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 2617 }, { "clip_ratio": 0.0, "completion_length": 487.7210159301758, "epoch": 0.7820177731312076, "grad_norm": 0.38075339794158936, "kl": 0.13427734375, "learning_rate": 2.7541587471657205e-06, "loss": 0.0387, "reward": 2.106026917695999, "reward_std": 0.15296300314366817, "rewards/accuracy_reward": 0.12276786658912897, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 2618 }, { "clip_ratio": 0.0, "completion_length": 507.55358123779297, "epoch": 0.782316481218729, "grad_norm": 0.22452695667743683, "kl": 0.156005859375, "learning_rate": 2.7469743006732964e-06, "loss": 0.0511, "reward": 2.0301340520381927, "reward_std": 0.1718068439513445, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553954601288, "step": 2619 }, { "clip_ratio": 0.0, "completion_length": 463.9218978881836, "epoch": 0.7826151893062505, "grad_norm": 0.5103834271430969, "kl": 0.239013671875, "learning_rate": 2.7397977447619606e-06, "loss": 0.0781, "reward": 2.0401786863803864, "reward_std": 0.11072595044970512, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 2620 }, { "clip_ratio": 0.0, "completion_length": 492.78128814697266, "epoch": 0.7829138973937719, "grad_norm": 0.15705819427967072, "kl": 0.1253662109375, "learning_rate": 2.732629087239106e-06, "loss": 0.0126, "reward": 2.1445313096046448, "reward_std": 0.14728985913097858, "rewards/accuracy_reward": 0.16071428917348385, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 2621 }, { "clip_ratio": 0.0, "completion_length": 485.3772506713867, "epoch": 0.7832126054812935, "grad_norm": 0.9097967743873596, "kl": 0.13916015625, "learning_rate": 2.7254683359035216e-06, "loss": 0.0276, "reward": 2.126674175262451, "reward_std": 0.10078805591911077, "rewards/accuracy_reward": 0.14508929196745157, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.992745578289032, "step": 2622 }, { "clip_ratio": 0.0, "completion_length": 543.8460159301758, "epoch": 0.7835113135688149, "grad_norm": 0.24160507321357727, "kl": 0.1427001953125, "learning_rate": 2.7183154985454075e-06, "loss": 0.0101, "reward": 2.039062589406967, "reward_std": 0.14524613507092, "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 2623 }, { "clip_ratio": 0.0, "completion_length": 453.10717010498047, "epoch": 0.7838100216563364, "grad_norm": 0.5805845856666565, "kl": 0.42822265625, "learning_rate": 2.711170582946352e-06, "loss": 0.0414, "reward": 2.0831474363803864, "reward_std": 0.20964486710727215, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9849330633878708, "step": 2624 }, { "clip_ratio": 0.0, "completion_length": 482.2522506713867, "epoch": 0.7841087297438578, "grad_norm": 0.4767290949821472, "kl": 0.2119140625, "learning_rate": 2.7040335968793174e-06, "loss": 0.1038, "reward": 2.122209906578064, "reward_std": 0.21545179933309555, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.983816996216774, "step": 2625 }, { "clip_ratio": 0.0, "completion_length": 516.4777069091797, "epoch": 0.7844074378313793, "grad_norm": 1.1706491708755493, "kl": 0.350830078125, "learning_rate": 2.6969045481086476e-06, "loss": 0.0886, "reward": 2.073102742433548, "reward_std": 0.2197880521416664, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9815848767757416, "step": 2626 }, { "clip_ratio": 0.0, "completion_length": 508.00894927978516, "epoch": 0.7847061459189008, "grad_norm": 1.209365963935852, "kl": 0.554931640625, "learning_rate": 2.689783444390053e-06, "loss": 0.1073, "reward": 2.0396206080913544, "reward_std": 0.26338205859065056, "rewards/accuracy_reward": 0.09598214970901608, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.977120578289032, "step": 2627 }, { "clip_ratio": 0.0, "completion_length": 515.9977951049805, "epoch": 0.7850048540064222, "grad_norm": 1.147337794303894, "kl": 0.433837890625, "learning_rate": 2.6826702934705885e-06, "loss": 0.0553, "reward": 2.087611675262451, "reward_std": 0.22633856534957886, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 2628 }, { "clip_ratio": 0.0, "completion_length": 514.4018020629883, "epoch": 0.7853035620939437, "grad_norm": 0.2767829895019531, "kl": 0.172607421875, "learning_rate": 2.6755651030886733e-06, "loss": 0.0451, "reward": 2.0262277722358704, "reward_std": 0.14927378855645657, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9882812798023224, "step": 2629 }, { "clip_ratio": 0.0, "completion_length": 510.8839416503906, "epoch": 0.7856022701814651, "grad_norm": 0.3976430594921112, "kl": 0.149169921875, "learning_rate": 2.6684678809740505e-06, "loss": 0.0856, "reward": 2.060826003551483, "reward_std": 0.14359062165021896, "rewards/accuracy_reward": 0.08705357438884676, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9893973618745804, "step": 2630 }, { "clip_ratio": 0.0, "completion_length": 546.4375152587891, "epoch": 0.7859009782689866, "grad_norm": 0.2565854787826538, "kl": 0.3505859375, "learning_rate": 2.6613786348478053e-06, "loss": 0.0982, "reward": 2.080357253551483, "reward_std": 0.24691227078437805, "rewards/accuracy_reward": 0.13169643585570157, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9776786267757416, "step": 2631 }, { "clip_ratio": 0.0, "completion_length": 537.6071624755859, "epoch": 0.786199686356508, "grad_norm": 4.984301567077637, "kl": 0.398193359375, "learning_rate": 2.6542973724223475e-06, "loss": 0.1249, "reward": 2.0100447237491608, "reward_std": 0.217438081279397, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9765625447034836, "step": 2632 }, { "clip_ratio": 0.0, "completion_length": 552.3928833007812, "epoch": 0.7864983944440296, "grad_norm": 0.6404681205749512, "kl": 0.365234375, "learning_rate": 2.647224101401389e-06, "loss": 0.077, "reward": 2.0139510333538055, "reward_std": 0.198871286585927, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.984933078289032, "step": 2633 }, { "clip_ratio": 0.0, "completion_length": 500.02234649658203, "epoch": 0.786797102531551, "grad_norm": 0.3922658860683441, "kl": 0.188720703125, "learning_rate": 2.6401588294799574e-06, "loss": 0.051, "reward": 2.1328125596046448, "reward_std": 0.23729006201028824, "rewards/accuracy_reward": 0.17633929336443543, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.9810268133878708, "step": 2634 }, { "clip_ratio": 0.0, "completion_length": 510.6205596923828, "epoch": 0.7870958106190725, "grad_norm": 1.0699141025543213, "kl": 0.3740234375, "learning_rate": 2.633101564344381e-06, "loss": 0.1062, "reward": 2.0591518580913544, "reward_std": 0.26757729053497314, "rewards/accuracy_reward": 0.10937500186264515, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9787946790456772, "step": 2635 }, { "clip_ratio": 0.0, "completion_length": 548.4844055175781, "epoch": 0.7873945187065939, "grad_norm": 8.688281059265137, "kl": 0.78271484375, "learning_rate": 2.626052313672267e-06, "loss": 0.1693, "reward": 1.9263393878936768, "reward_std": 0.34926391392946243, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.9241071790456772, "rewards/tag_count_reward": 0.9464286118745804, "step": 2636 }, { "clip_ratio": 0.0, "completion_length": 530.3951187133789, "epoch": 0.7876932267941155, "grad_norm": 0.4234243631362915, "kl": 0.38525390625, "learning_rate": 2.61901108513251e-06, "loss": 0.0871, "reward": 2.029576003551483, "reward_std": 0.2443981170654297, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9804687947034836, "step": 2637 }, { "clip_ratio": 0.0, "completion_length": 552.0915374755859, "epoch": 0.7879919348816369, "grad_norm": 0.9694635272026062, "kl": 0.45068359375, "learning_rate": 2.611977886385282e-06, "loss": 0.1158, "reward": 2.0652902722358704, "reward_std": 0.3443847745656967, "rewards/accuracy_reward": 0.1294642873108387, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9715401977300644, "step": 2638 }, { "clip_ratio": 0.0, "completion_length": 512.4018020629883, "epoch": 0.7882906429691584, "grad_norm": 3.4288644790649414, "kl": 0.349609375, "learning_rate": 2.604952725082005e-06, "loss": 0.1191, "reward": 2.091517925262451, "reward_std": 0.29360548220574856, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.970982164144516, "step": 2639 }, { "clip_ratio": 0.0, "completion_length": 524.6674423217773, "epoch": 0.7885893510566798, "grad_norm": 1.0934525728225708, "kl": 0.454345703125, "learning_rate": 2.5979356088653718e-06, "loss": 0.1548, "reward": 2.0273437798023224, "reward_std": 0.3251454755663872, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.9575893133878708, "rewards/tag_count_reward": 0.969308078289032, "step": 2640 }, { "clip_ratio": 0.0, "completion_length": 577.8705596923828, "epoch": 0.7888880591442013, "grad_norm": 0.9101006388664246, "kl": 0.68701171875, "learning_rate": 2.5909265453693187e-06, "loss": 0.1611, "reward": 1.993861734867096, "reward_std": 0.4154718369245529, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.9308036118745804, "rewards/tag_count_reward": 0.951450914144516, "step": 2641 }, { "clip_ratio": 0.0, "completion_length": 534.6451110839844, "epoch": 0.7891867672317228, "grad_norm": 5.555881500244141, "kl": 0.546875, "learning_rate": 2.5839255422190136e-06, "loss": 0.1655, "reward": 2.044642984867096, "reward_std": 0.2650579810142517, "rewards/accuracy_reward": 0.10267857741564512, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9754464775323868, "step": 2642 }, { "clip_ratio": 0.0, "completion_length": 579.044677734375, "epoch": 0.7894854753192443, "grad_norm": 2.2681186199188232, "kl": 0.66650390625, "learning_rate": 2.5769326070308676e-06, "loss": 0.1876, "reward": 1.954241156578064, "reward_std": 0.3717816714197397, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.9263393431901932, "rewards/tag_count_reward": 0.9453125298023224, "step": 2643 }, { "clip_ratio": 0.0, "completion_length": 542.3169784545898, "epoch": 0.7897841834067657, "grad_norm": 5.050332546234131, "kl": 0.54931640625, "learning_rate": 2.5699477474125044e-06, "loss": 0.1842, "reward": 1.9508929252624512, "reward_std": 0.41513024270534515, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.926339328289032, "rewards/tag_count_reward": 0.9486607760190964, "step": 2644 }, { "clip_ratio": 0.0, "completion_length": 505.2812805175781, "epoch": 0.7900828914942872, "grad_norm": 0.9508845210075378, "kl": 0.65771484375, "learning_rate": 2.562970970962768e-06, "loss": 0.182, "reward": 2.027901917695999, "reward_std": 0.3040843140333891, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9676339775323868, "step": 2645 }, { "clip_ratio": 0.0, "completion_length": 552.6518096923828, "epoch": 0.7903815995818086, "grad_norm": 2.7482082843780518, "kl": 0.73779296875, "learning_rate": 2.5560022852717115e-06, "loss": 0.16, "reward": 2.0848215222358704, "reward_std": 0.2824629582464695, "rewards/accuracy_reward": 0.15848215157166123, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9709821939468384, "step": 2646 }, { "clip_ratio": 0.0, "completion_length": 545.9933242797852, "epoch": 0.7906803076693302, "grad_norm": 9.854264259338379, "kl": 1.745361328125, "learning_rate": 2.5490416979205758e-06, "loss": 0.1962, "reward": 2.0206474363803864, "reward_std": 0.29082850366830826, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.9419643431901932, "rewards/tag_count_reward": 0.9626116454601288, "step": 2647 }, { "clip_ratio": 0.0, "completion_length": 539.6116409301758, "epoch": 0.7909790157568516, "grad_norm": 1.4076149463653564, "kl": 0.826171875, "learning_rate": 2.542089216481799e-06, "loss": 0.1803, "reward": 1.9637277722358704, "reward_std": 0.3259435184299946, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.93526791036129, "rewards/tag_count_reward": 0.9592634290456772, "step": 2648 }, { "clip_ratio": 0.0, "completion_length": 491.8303756713867, "epoch": 0.7912777238443731, "grad_norm": 0.9984496831893921, "kl": 0.9736328125, "learning_rate": 2.5351448485190043e-06, "loss": 0.2185, "reward": 2.0764509737491608, "reward_std": 0.37485186755657196, "rewards/accuracy_reward": 0.18526786658912897, "rewards/format_reward": 0.93526791036129, "rewards/tag_count_reward": 0.9559152126312256, "step": 2649 }, { "clip_ratio": 0.0, "completion_length": 549.4955749511719, "epoch": 0.7915764319318945, "grad_norm": 4.382633209228516, "kl": 1.4990234375, "learning_rate": 2.5282086015869777e-06, "loss": 0.2006, "reward": 2.0217634737491608, "reward_std": 0.38390473276376724, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.9397321790456772, "rewards/tag_count_reward": 0.9659598618745804, "step": 2650 }, { "clip_ratio": 0.0, "completion_length": 537.0357360839844, "epoch": 0.7918751400194161, "grad_norm": 1.5882079601287842, "kl": 1.1298828125, "learning_rate": 2.5212804832316783e-06, "loss": 0.2068, "reward": 2.034040242433548, "reward_std": 0.38484451174736023, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.9330357760190964, "rewards/tag_count_reward": 0.9514509439468384, "step": 2651 }, { "clip_ratio": 0.0, "completion_length": 528.6674423217773, "epoch": 0.7921738481069375, "grad_norm": 0.8153865337371826, "kl": 0.8203125, "learning_rate": 2.514360500990223e-06, "loss": 0.1699, "reward": 1.9497768580913544, "reward_std": 0.26669249683618546, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.9441964626312256, "rewards/tag_count_reward": 0.9609375596046448, "step": 2652 }, { "clip_ratio": 0.0, "completion_length": 542.9531555175781, "epoch": 0.792472556194459, "grad_norm": 1.3450994491577148, "kl": 0.70458984375, "learning_rate": 2.5074486623908668e-06, "loss": 0.1379, "reward": 1.9804688394069672, "reward_std": 0.33676696941256523, "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9648437947034836, "step": 2653 }, { "clip_ratio": 0.0, "completion_length": 494.88394927978516, "epoch": 0.7927712642819804, "grad_norm": 2.0521936416625977, "kl": 0.83740234375, "learning_rate": 2.5005449749530174e-06, "loss": 0.208, "reward": 2.01897332072258, "reward_std": 0.3427983671426773, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.9397321939468384, "rewards/tag_count_reward": 0.9631696939468384, "step": 2654 }, { "clip_ratio": 0.0, "completion_length": 505.43529510498047, "epoch": 0.7930699723695019, "grad_norm": 0.9902012348175049, "kl": 0.9970703125, "learning_rate": 2.493649446187213e-06, "loss": 0.2398, "reward": 1.9196429550647736, "reward_std": 0.39535974711179733, "rewards/accuracy_reward": 0.044642857974395156, "rewards/format_reward": 0.9241071790456772, "rewards/tag_count_reward": 0.9508928805589676, "step": 2655 }, { "clip_ratio": 0.0, "completion_length": 499.9375305175781, "epoch": 0.7933686804570234, "grad_norm": 1.0154156684875488, "kl": 0.765625, "learning_rate": 2.4867620835951066e-06, "loss": 0.2246, "reward": 1.9955357909202576, "reward_std": 0.38705479353666306, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9352678805589676, "rewards/tag_count_reward": 0.957589328289032, "step": 2656 }, { "clip_ratio": 0.0, "completion_length": 487.33483123779297, "epoch": 0.7936673885445449, "grad_norm": 1.1089117527008057, "kl": 0.720703125, "learning_rate": 2.479882894669481e-06, "loss": 0.1694, "reward": 2.0301340520381927, "reward_std": 0.3005603477358818, "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9720982611179352, "step": 2657 }, { "clip_ratio": 0.0, "completion_length": 493.24779510498047, "epoch": 0.7939660966320663, "grad_norm": 0.8610315322875977, "kl": 0.83056640625, "learning_rate": 2.473011886894211e-06, "loss": 0.1448, "reward": 2.053571581840515, "reward_std": 0.3245658278465271, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9486607611179352, "rewards/tag_count_reward": 0.9687500298023224, "step": 2658 }, { "clip_ratio": 0.0, "completion_length": 486.32814025878906, "epoch": 0.7942648047195878, "grad_norm": 1.225783348083496, "kl": 0.5859375, "learning_rate": 2.4661490677442834e-06, "loss": 0.1894, "reward": 1.9832590222358704, "reward_std": 0.3080999404191971, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9587053954601288, "step": 2659 }, { "clip_ratio": 0.0, "completion_length": 507.5044860839844, "epoch": 0.7945635128071092, "grad_norm": 2.197431802749634, "kl": 0.857421875, "learning_rate": 2.459294444685778e-06, "loss": 0.1917, "reward": 2.0000001192092896, "reward_std": 0.34961482137441635, "rewards/accuracy_reward": 0.09151786379516125, "rewards/format_reward": 0.9419643133878708, "rewards/tag_count_reward": 0.9665178805589676, "step": 2660 }, { "clip_ratio": 0.0, "completion_length": 485.9486770629883, "epoch": 0.7948622208946308, "grad_norm": 10.225906372070312, "kl": 0.9248046875, "learning_rate": 2.452448025175844e-06, "loss": 0.1748, "reward": 2.015625089406967, "reward_std": 0.2969442345201969, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9620536118745804, "step": 2661 }, { "clip_ratio": 0.0, "completion_length": 508.9553756713867, "epoch": 0.7951609289821522, "grad_norm": 2.4273226261138916, "kl": 0.431640625, "learning_rate": 2.4456098166627194e-06, "loss": 0.1383, "reward": 2.0055804550647736, "reward_std": 0.24134613201022148, "rewards/accuracy_reward": 0.06919643119908869, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9765625447034836, "step": 2662 }, { "clip_ratio": 0.0, "completion_length": 507.14734649658203, "epoch": 0.7954596370696737, "grad_norm": 10.397481918334961, "kl": 1.078125, "learning_rate": 2.4387798265857078e-06, "loss": 0.2995, "reward": 1.9419643580913544, "reward_std": 0.4393833503127098, "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.9017857611179352, "rewards/tag_count_reward": 0.9352678954601288, "step": 2663 }, { "clip_ratio": 0.0, "completion_length": 470.85047149658203, "epoch": 0.7957583451571951, "grad_norm": 0.8655720949172974, "kl": 0.501708984375, "learning_rate": 2.4319580623751614e-06, "loss": 0.152, "reward": 2.0987724363803864, "reward_std": 0.2635178752243519, "rewards/accuracy_reward": 0.16964286658912897, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9693080931901932, "step": 2664 }, { "clip_ratio": 0.0, "completion_length": 481.22769927978516, "epoch": 0.7960570532447167, "grad_norm": 1.3353288173675537, "kl": 0.736328125, "learning_rate": 2.425144531452497e-06, "loss": 0.2098, "reward": 2.072544753551483, "reward_std": 0.30970196425914764, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.9464286118745804, "rewards/tag_count_reward": 0.965401828289032, "step": 2665 }, { "clip_ratio": 0.0, "completion_length": 496.09154510498047, "epoch": 0.7963557613322381, "grad_norm": 1.399744987487793, "kl": 0.494140625, "learning_rate": 2.4183392412301686e-06, "loss": 0.1899, "reward": 1.989397406578064, "reward_std": 0.31393295153975487, "rewards/accuracy_reward": 0.06026786100119352, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.9737723618745804, "step": 2666 }, { "clip_ratio": 0.0, "completion_length": 485.26342010498047, "epoch": 0.7966544694197596, "grad_norm": 1.002046823501587, "kl": 0.6796875, "learning_rate": 2.4115421991116605e-06, "loss": 0.2154, "reward": 1.9698662161827087, "reward_std": 0.32827308028936386, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9587053954601288, "step": 2667 }, { "clip_ratio": 0.0, "completion_length": 489.17859649658203, "epoch": 0.796953177507281, "grad_norm": 4.727227210998535, "kl": 0.93798828125, "learning_rate": 2.4047534124914907e-06, "loss": 0.2682, "reward": 1.9626116752624512, "reward_std": 0.38323426619172096, "rewards/accuracy_reward": 0.07366071967408061, "rewards/format_reward": 0.93526791036129, "rewards/tag_count_reward": 0.953683078289032, "step": 2668 }, { "clip_ratio": 0.0, "completion_length": 491.82592010498047, "epoch": 0.7972518855948025, "grad_norm": 3.3049538135528564, "kl": 1.2578125, "learning_rate": 2.397972888755197e-06, "loss": 0.3179, "reward": 1.8738840222358704, "reward_std": 0.46028588712215424, "rewards/accuracy_reward": 0.031250001629814506, "rewards/format_reward": 0.9062500447034836, "rewards/tag_count_reward": 0.9363839626312256, "step": 2669 }, { "clip_ratio": 0.0, "completion_length": 454.4419860839844, "epoch": 0.797550593682324, "grad_norm": 1.609108328819275, "kl": 1.1259765625, "learning_rate": 2.3912006352793184e-06, "loss": 0.3065, "reward": 1.8889509737491608, "reward_std": 0.4069697633385658, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.9425223618745804, "step": 2670 }, { "clip_ratio": 0.0, "completion_length": 489.2656478881836, "epoch": 0.7978493017698454, "grad_norm": 2.92680287361145, "kl": 1.2275390625, "learning_rate": 2.3844366594314096e-06, "loss": 0.3196, "reward": 1.9631697535514832, "reward_std": 0.48765651881694794, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.8883928954601288, "rewards/tag_count_reward": 0.9229911118745804, "step": 2671 }, { "clip_ratio": 0.0, "completion_length": 447.82591247558594, "epoch": 0.7981480098573669, "grad_norm": 3.6147940158843994, "kl": 1.2529296875, "learning_rate": 2.3776809685700063e-06, "loss": 0.264, "reward": 1.963727742433548, "reward_std": 0.4227661266922951, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.9084821939468384, "rewards/tag_count_reward": 0.9369420111179352, "step": 2672 }, { "clip_ratio": 0.0, "completion_length": 430.8460006713867, "epoch": 0.7984467179448883, "grad_norm": 1.1562179327011108, "kl": 0.61669921875, "learning_rate": 2.3709335700446425e-06, "loss": 0.2221, "reward": 2.0418528020381927, "reward_std": 0.28681014105677605, "rewards/accuracy_reward": 0.11607143259607255, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.968191996216774, "step": 2673 }, { "clip_ratio": 0.0, "completion_length": 464.6339416503906, "epoch": 0.7987454260324098, "grad_norm": 2.6575136184692383, "kl": 1.1396484375, "learning_rate": 2.3641944711958286e-06, "loss": 0.2929, "reward": 1.9146206378936768, "reward_std": 0.3823224604129791, "rewards/accuracy_reward": 0.06250000093132257, "rewards/format_reward": 0.9129464775323868, "rewards/tag_count_reward": 0.93917416036129, "step": 2674 }, { "clip_ratio": 0.0, "completion_length": 474.43528747558594, "epoch": 0.7990441341199312, "grad_norm": 41.660362243652344, "kl": 1.373046875, "learning_rate": 2.3574636793550376e-06, "loss": 0.3923, "reward": 1.9347099363803864, "reward_std": 0.49761340022087097, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.8950893431901932, "rewards/tag_count_reward": 0.9302455633878708, "step": 2675 }, { "clip_ratio": 0.0, "completion_length": 438.7031478881836, "epoch": 0.7993428422074528, "grad_norm": 4.759024620056152, "kl": 1.8388671875, "learning_rate": 2.350741201844714e-06, "loss": 0.4263, "reward": 1.881696492433548, "reward_std": 0.4368711858987808, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.8973214626312256, "rewards/tag_count_reward": 0.9308036267757416, "step": 2676 }, { "clip_ratio": 0.0, "completion_length": 439.1875228881836, "epoch": 0.7996415502949742, "grad_norm": 1.281712293624878, "kl": 0.841796875, "learning_rate": 2.3440270459782575e-06, "loss": 0.2562, "reward": 1.9871653318405151, "reward_std": 0.41892824321985245, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.9447545111179352, "step": 2677 }, { "clip_ratio": 0.0, "completion_length": 475.5357437133789, "epoch": 0.7999402583824957, "grad_norm": 13.275123596191406, "kl": 2.5859375, "learning_rate": 2.337321219060007e-06, "loss": 0.4423, "reward": 1.9235492050647736, "reward_std": 0.44925037026405334, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.8839286118745804, "rewards/tag_count_reward": 0.9235491454601288, "step": 2678 }, { "clip_ratio": 0.0, "completion_length": 447.5960006713867, "epoch": 0.8002389664700171, "grad_norm": 147.1424560546875, "kl": 1.5927734375, "learning_rate": 2.330623728385246e-06, "loss": 0.3979, "reward": 1.9051340222358704, "reward_std": 0.4125876724720001, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.940848246216774, "step": 2679 }, { "clip_ratio": 0.0, "completion_length": 450.68082427978516, "epoch": 0.8005376745575387, "grad_norm": 61.70359420776367, "kl": 0.9931640625, "learning_rate": 2.3239345812401913e-06, "loss": 0.3107, "reward": 1.9994420409202576, "reward_std": 0.43700721859931946, "rewards/accuracy_reward": 0.12500001024454832, "rewards/format_reward": 0.926339328289032, "rewards/tag_count_reward": 0.9481027275323868, "step": 2680 }, { "clip_ratio": 0.0, "completion_length": 450.3348388671875, "epoch": 0.8008363826450601, "grad_norm": 1.2519259452819824, "kl": 0.57861328125, "learning_rate": 2.317253784901976e-06, "loss": 0.1793, "reward": 1.9670760035514832, "reward_std": 0.2563748434185982, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9670759439468384, "step": 2681 }, { "clip_ratio": 0.0, "completion_length": 413.7120666503906, "epoch": 0.8011350907325816, "grad_norm": 2.9001784324645996, "kl": 1.296875, "learning_rate": 2.3105813466386538e-06, "loss": 0.3955, "reward": 1.9804688394069672, "reward_std": 0.4441663399338722, "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.9129464626312256, "rewards/tag_count_reward": 0.9380580931901932, "step": 2682 }, { "clip_ratio": 0.0, "completion_length": 432.23439025878906, "epoch": 0.801433798820103, "grad_norm": 3.159109592437744, "kl": 1.03515625, "learning_rate": 2.303917273709181e-06, "loss": 0.2621, "reward": 1.962053656578064, "reward_std": 0.34898392856121063, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.9375000298023224, "rewards/tag_count_reward": 0.9620536118745804, "step": 2683 }, { "clip_ratio": 0.0, "completion_length": 392.6518020629883, "epoch": 0.8017325069076245, "grad_norm": 36.749080657958984, "kl": 0.9658203125, "learning_rate": 2.2972615733634164e-06, "loss": 0.3197, "reward": 1.9508929550647736, "reward_std": 0.39975082874298096, "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.9308036118745804, "rewards/tag_count_reward": 0.9531250447034836, "step": 2684 }, { "clip_ratio": 0.0, "completion_length": 403.2455520629883, "epoch": 0.802031214995146, "grad_norm": 2.194207191467285, "kl": 1.0830078125, "learning_rate": 2.2906142528421127e-06, "loss": 0.3358, "reward": 2.130580425262451, "reward_std": 0.44425972551107407, "rewards/accuracy_reward": 0.2700893022119999, "rewards/format_reward": 0.9196428954601288, "rewards/tag_count_reward": 0.9408482611179352, "step": 2685 }, { "clip_ratio": 0.0, "completion_length": 409.4397506713867, "epoch": 0.8023299230826675, "grad_norm": 4.159286022186279, "kl": 1.01025390625, "learning_rate": 2.2839753193768988e-06, "loss": 0.3053, "reward": 1.9241072237491608, "reward_std": 0.3832045644521713, "rewards/accuracy_reward": 0.05803571594879031, "rewards/format_reward": 0.9218750596046448, "rewards/tag_count_reward": 0.9441964626312256, "step": 2686 }, { "clip_ratio": 0.0, "completion_length": 386.6763610839844, "epoch": 0.8026286311701889, "grad_norm": 1.7927322387695312, "kl": 0.95751953125, "learning_rate": 2.277344780190286e-06, "loss": 0.3284, "reward": 1.9274554252624512, "reward_std": 0.348530612885952, "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.9263393431901932, "rewards/tag_count_reward": 0.9475446939468384, "step": 2687 }, { "clip_ratio": 0.0, "completion_length": 395.6696548461914, "epoch": 0.8029273392577104, "grad_norm": 2.4175381660461426, "kl": 0.9521484375, "learning_rate": 2.270722642495653e-06, "loss": 0.3687, "reward": 1.9899554252624512, "reward_std": 0.4193105250597, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.91964291036129, "rewards/tag_count_reward": 0.9475446790456772, "step": 2688 }, { "clip_ratio": 0.0, "completion_length": 384.9241256713867, "epoch": 0.8032260473452318, "grad_norm": 69.60901641845703, "kl": 1.408203125, "learning_rate": 2.2641089134972317e-06, "loss": 0.3927, "reward": 1.920758992433548, "reward_std": 0.37493521720170975, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.9084821790456772, "rewards/tag_count_reward": 0.9363839775323868, "step": 2689 }, { "clip_ratio": 0.0, "completion_length": 415.3772430419922, "epoch": 0.8035247554327534, "grad_norm": 6.244246006011963, "kl": 1.83203125, "learning_rate": 2.257503600390114e-06, "loss": 0.5358, "reward": 1.807477742433548, "reward_std": 0.5020678117871284, "rewards/accuracy_reward": 0.017857143888249993, "rewards/format_reward": 0.8727678954601288, "rewards/tag_count_reward": 0.9168527275323868, "step": 2690 }, { "clip_ratio": 0.0, "completion_length": 378.03126525878906, "epoch": 0.8038234635202748, "grad_norm": 3.256105661392212, "kl": 1.400390625, "learning_rate": 2.2509067103602354e-06, "loss": 0.4526, "reward": 1.9040179252624512, "reward_std": 0.42247268557548523, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.9040178954601288, "rewards/tag_count_reward": 0.9330357611179352, "step": 2691 }, { "clip_ratio": 0.0, "completion_length": 378.15625762939453, "epoch": 0.8041221716077963, "grad_norm": 10.561646461486816, "kl": 1.89453125, "learning_rate": 2.244318250584361e-06, "loss": 0.4645, "reward": 1.8945313394069672, "reward_std": 0.48684727400541306, "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.87276791036129, "rewards/tag_count_reward": 0.9101562947034836, "step": 2692 }, { "clip_ratio": 0.0, "completion_length": 424.3772430419922, "epoch": 0.8044208796953177, "grad_norm": 7.8265252113342285, "kl": 2.76953125, "learning_rate": 2.237738228230091e-06, "loss": 0.7775, "reward": 1.6791295111179352, "reward_std": 0.6327333077788353, "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.7924107313156128, "rewards/tag_count_reward": 0.8532366454601288, "step": 2693 }, { "clip_ratio": 0.0, "completion_length": 350.1830520629883, "epoch": 0.8047195877828393, "grad_norm": 3.788282871246338, "kl": 1.4375, "learning_rate": 2.231166650455847e-06, "loss": 0.5025, "reward": 1.9179688096046448, "reward_std": 0.4349181726574898, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.901785746216774, "rewards/tag_count_reward": 0.929129496216774, "step": 2694 }, { "clip_ratio": 0.0, "completion_length": 355.5178680419922, "epoch": 0.8050182958703607, "grad_norm": 13.278861999511719, "kl": 2.029296875, "learning_rate": 2.2246035244108588e-06, "loss": 0.5629, "reward": 1.9598215222358704, "reward_std": 0.45529167354106903, "rewards/accuracy_reward": 0.14955357578583062, "rewards/format_reward": 0.8861607611179352, "rewards/tag_count_reward": 0.9241071790456772, "step": 2695 }, { "clip_ratio": 0.0, "completion_length": 391.71653747558594, "epoch": 0.8053170039578822, "grad_norm": 2.3675990104675293, "kl": 2.201171875, "learning_rate": 2.2180488572351667e-06, "loss": 0.7872, "reward": 1.7265625596046448, "reward_std": 0.593453511595726, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.8303571790456772, "rewards/tag_count_reward": 0.8850446790456772, "step": 2696 }, { "clip_ratio": 0.0, "completion_length": 340.3370666503906, "epoch": 0.8056157120454036, "grad_norm": 5.308838367462158, "kl": 1.705078125, "learning_rate": 2.211502656059602e-06, "loss": 0.5285, "reward": 1.9704241752624512, "reward_std": 0.42970389127731323, "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.8973214626312256, "rewards/tag_count_reward": 0.9257812947034836, "step": 2697 }, { "clip_ratio": 0.0, "completion_length": 350.1495666503906, "epoch": 0.8059144201329251, "grad_norm": 7.70464563369751, "kl": 2.76171875, "learning_rate": 2.204964928005794e-06, "loss": 0.7932, "reward": 1.8610492050647736, "reward_std": 0.5471006259322166, "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.8593750298023224, "rewards/tag_count_reward": 0.9012277275323868, "step": 2698 }, { "clip_ratio": 0.0, "completion_length": 329.3080520629883, "epoch": 0.8062131282204466, "grad_norm": 9.257550239562988, "kl": 2.0380859375, "learning_rate": 2.1984356801861506e-06, "loss": 0.63, "reward": 1.918526828289032, "reward_std": 0.444450207054615, "rewards/accuracy_reward": 0.0915178582072258, "rewards/format_reward": 0.8973214775323868, "rewards/tag_count_reward": 0.9296875596046448, "step": 2699 }, { "clip_ratio": 0.0, "completion_length": 343.18751525878906, "epoch": 0.8065118363079681, "grad_norm": 106.8553237915039, "kl": 2.4013671875, "learning_rate": 2.1919149197038494e-06, "loss": 0.5524, "reward": 1.8962054252624512, "reward_std": 0.43586108088493347, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.8638393133878708, "rewards/tag_count_reward": 0.9073661118745804, "step": 2700 }, { "clip_ratio": 0.0, "completion_length": 273.3705406188965, "epoch": 0.8068105443954895, "grad_norm": 3.0279834270477295, "kl": 1.05810546875, "learning_rate": 2.1854026536528405e-06, "loss": 0.4376, "reward": 2.0016742050647736, "reward_std": 0.3636321946978569, "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.9375000447034836, "rewards/tag_count_reward": 0.9570312947034836, "step": 2701 }, { "clip_ratio": 0.0, "completion_length": 333.6317138671875, "epoch": 0.807109252483011, "grad_norm": 11.21565055847168, "kl": 1.58203125, "learning_rate": 2.1788988891178342e-06, "loss": 0.56, "reward": 1.9252232611179352, "reward_std": 0.3956140726804733, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.8928571939468384, "rewards/tag_count_reward": 0.9229911118745804, "step": 2702 }, { "clip_ratio": 0.0, "completion_length": 320.1852798461914, "epoch": 0.8074079605705324, "grad_norm": 4.334867000579834, "kl": 1.84375, "learning_rate": 2.172403633174284e-06, "loss": 0.456, "reward": 1.911272406578064, "reward_std": 0.3842606395483017, "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.8906250298023224, "rewards/tag_count_reward": 0.922433078289032, "step": 2703 }, { "clip_ratio": 0.0, "completion_length": 324.2835006713867, "epoch": 0.807706668658054, "grad_norm": 3.56825590133667, "kl": 1.470703125, "learning_rate": 2.1659168928883933e-06, "loss": 0.5059, "reward": 1.91741082072258, "reward_std": 0.4494297653436661, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.9040178954601288, "rewards/tag_count_reward": 0.93526791036129, "step": 2704 }, { "clip_ratio": 0.0, "completion_length": 286.50001525878906, "epoch": 0.8080053767455754, "grad_norm": 3.134756088256836, "kl": 1.15625, "learning_rate": 2.1594386753171035e-06, "loss": 0.4847, "reward": 1.898995578289032, "reward_std": 0.3264428675174713, "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.9241071790456772, "rewards/tag_count_reward": 0.9525670111179352, "step": 2705 }, { "clip_ratio": 0.0, "completion_length": 296.3035888671875, "epoch": 0.8083040848330969, "grad_norm": 8.444872856140137, "kl": 1.4130859375, "learning_rate": 2.152968987508075e-06, "loss": 0.5118, "reward": 1.9274554550647736, "reward_std": 0.34395869076251984, "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.93526791036129, "rewards/tag_count_reward": 0.9542411267757416, "step": 2706 }, { "clip_ratio": 0.0, "completion_length": 280.10269927978516, "epoch": 0.8086027929206183, "grad_norm": 3.153169631958008, "kl": 1.068359375, "learning_rate": 2.146507836499697e-06, "loss": 0.5385, "reward": 1.9308036863803864, "reward_std": 0.355979535728693, "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.9285714775323868, "rewards/tag_count_reward": 0.95089291036129, "step": 2707 }, { "clip_ratio": 0.0, "completion_length": 270.11162185668945, "epoch": 0.8089015010081398, "grad_norm": 5.517673969268799, "kl": 0.9931640625, "learning_rate": 2.1400552293210697e-06, "loss": 0.4828, "reward": 1.971540242433548, "reward_std": 0.32333215326070786, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.9397321939468384, "rewards/tag_count_reward": 0.960379496216774, "step": 2708 }, { "clip_ratio": 0.0, "completion_length": 286.11831283569336, "epoch": 0.8092002090956613, "grad_norm": 6.1076483726501465, "kl": 1.6064453125, "learning_rate": 2.133611172991993e-06, "loss": 0.5255, "reward": 1.9224331080913544, "reward_std": 0.3635866940021515, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.9218750298023224, "rewards/tag_count_reward": 0.9492187798023224, "step": 2709 }, { "clip_ratio": 0.0, "completion_length": 271.82813262939453, "epoch": 0.8094989171831828, "grad_norm": 2081.7919921875, "kl": 3.2841796875, "learning_rate": 2.1271756745229744e-06, "loss": 0.6075, "reward": 1.9430804252624512, "reward_std": 0.34645215421915054, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.9218750447034836, "rewards/tag_count_reward": 0.9520089775323868, "step": 2710 }, { "clip_ratio": 0.0, "completion_length": 249.25894165039062, "epoch": 0.8097976252707042, "grad_norm": 7.76999568939209, "kl": 1.705078125, "learning_rate": 2.1207487409151984e-06, "loss": 0.4651, "reward": 1.9324777722358704, "reward_std": 0.2605358622968197, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.9681920111179352, "step": 2711 }, { "clip_ratio": 0.0, "completion_length": 276.7790298461914, "epoch": 0.8100963333582257, "grad_norm": 131.9098358154297, "kl": 1.021484375, "learning_rate": 2.114330379160543e-06, "loss": 0.6195, "reward": 1.8610492050647736, "reward_std": 0.40564459562301636, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.917410746216774, "rewards/tag_count_reward": 0.9391741454601288, "step": 2712 }, { "clip_ratio": 0.0, "completion_length": 251.92188262939453, "epoch": 0.8103950414457471, "grad_norm": 2.4958608150482178, "kl": 0.70654296875, "learning_rate": 2.1079205962415593e-06, "loss": 0.4115, "reward": 1.9458706080913544, "reward_std": 0.296571709215641, "rewards/accuracy_reward": 0.04017857299186289, "rewards/format_reward": 0.9441964775323868, "rewards/tag_count_reward": 0.961495578289032, "step": 2713 }, { "clip_ratio": 0.0, "completion_length": 264.0803642272949, "epoch": 0.8106937495332686, "grad_norm": 58.60993576049805, "kl": 1.376953125, "learning_rate": 2.1015193991314577e-06, "loss": 0.4871, "reward": 1.9665179252624512, "reward_std": 0.32977671176195145, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.93526791036129, "rewards/tag_count_reward": 0.9598214775323868, "step": 2714 }, { "clip_ratio": 0.0, "completion_length": 277.98215103149414, "epoch": 0.8109924576207901, "grad_norm": 2.6347057819366455, "kl": 1.190673828125, "learning_rate": 2.0951267947941146e-06, "loss": 0.4657, "reward": 1.9704241752624512, "reward_std": 0.36391832306981087, "rewards/accuracy_reward": 0.10937500582076609, "rewards/format_reward": 0.91964291036129, "rewards/tag_count_reward": 0.9414062947034836, "step": 2715 }, { "clip_ratio": 0.0, "completion_length": 254.20759963989258, "epoch": 0.8112911657083115, "grad_norm": 5.357048988342285, "kl": 1.2470703125, "learning_rate": 2.088742790184062e-06, "loss": 0.4985, "reward": 2.0491071939468384, "reward_std": 0.31914055347442627, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9620536267757416, "step": 2716 }, { "clip_ratio": 0.0, "completion_length": 245.7009048461914, "epoch": 0.811589873795833, "grad_norm": 2.4820716381073, "kl": 0.884765625, "learning_rate": 2.0823673922464625e-06, "loss": 0.4537, "reward": 1.9324778020381927, "reward_std": 0.29654838517308235, "rewards/accuracy_reward": 0.03125000139698386, "rewards/format_reward": 0.9419643133878708, "rewards/tag_count_reward": 0.9592634290456772, "step": 2717 }, { "clip_ratio": 0.0, "completion_length": 270.2522430419922, "epoch": 0.8118885818833544, "grad_norm": 3.2555770874023438, "kl": 0.8486328125, "learning_rate": 2.0760006079171303e-06, "loss": 0.4245, "reward": 1.9866072237491608, "reward_std": 0.35542404651641846, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.9441964775323868, "rewards/tag_count_reward": 0.9598214775323868, "step": 2718 }, { "clip_ratio": 0.0, "completion_length": 238.35045623779297, "epoch": 0.812187289970876, "grad_norm": 0.8630750179290771, "kl": 0.41943359375, "learning_rate": 2.0696424441225037e-06, "loss": 0.3143, "reward": 2.041852831840515, "reward_std": 0.2690652161836624, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9620535969734192, "rewards/tag_count_reward": 0.9726562947034836, "step": 2719 }, { "clip_ratio": 0.0, "completion_length": 247.2187614440918, "epoch": 0.8124859980583974, "grad_norm": 2.583189010620117, "kl": 0.869140625, "learning_rate": 2.063292907779636e-06, "loss": 0.4545, "reward": 1.984933078289032, "reward_std": 0.34755711257457733, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.926339328289032, "rewards/tag_count_reward": 0.94698666036129, "step": 2720 }, { "clip_ratio": 0.0, "completion_length": 244.55804824829102, "epoch": 0.8127847061459189, "grad_norm": 2.2154972553253174, "kl": 0.7734375, "learning_rate": 2.0569520057962044e-06, "loss": 0.3424, "reward": 2.013392925262451, "reward_std": 0.28917692601680756, "rewards/accuracy_reward": 0.11160714412108064, "rewards/format_reward": 0.9419643431901932, "rewards/tag_count_reward": 0.9598214626312256, "step": 2721 }, { "clip_ratio": 0.0, "completion_length": 258.5424270629883, "epoch": 0.8130834142334403, "grad_norm": 6.040074825286865, "kl": 1.6640625, "learning_rate": 2.050619745070491e-06, "loss": 0.625, "reward": 2.000558078289032, "reward_std": 0.3738839402794838, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.9218750447034836, "rewards/tag_count_reward": 0.9447545111179352, "step": 2722 }, { "clip_ratio": 0.0, "completion_length": 246.7589454650879, "epoch": 0.8133821223209619, "grad_norm": 1.1393043994903564, "kl": 0.693359375, "learning_rate": 2.044296132491369e-06, "loss": 0.3708, "reward": 2.0691965222358704, "reward_std": 0.31089016050100327, "rewards/accuracy_reward": 0.1696428619325161, "rewards/format_reward": 0.941964328289032, "rewards/tag_count_reward": 0.9575893431901932, "step": 2723 }, { "clip_ratio": 0.0, "completion_length": 246.6897430419922, "epoch": 0.8136808304084833, "grad_norm": 48.74825668334961, "kl": 1.494140625, "learning_rate": 2.037981174938315e-06, "loss": 0.7308, "reward": 1.9810269176959991, "reward_std": 0.41149356216192245, "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.9330357611179352, "rewards/tag_count_reward": 0.949776828289032, "step": 2724 }, { "clip_ratio": 0.0, "completion_length": 226.94867324829102, "epoch": 0.8139795384960048, "grad_norm": 0.9941070675849915, "kl": 0.70263671875, "learning_rate": 2.031674879281378e-06, "loss": 0.3048, "reward": 1.9704242050647736, "reward_std": 0.2406511940062046, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9748884439468384, "step": 2725 }, { "clip_ratio": 0.0, "completion_length": 192.35045623779297, "epoch": 0.8142782465835262, "grad_norm": 1.9557567834854126, "kl": 0.643798828125, "learning_rate": 2.025377252381192e-06, "loss": 0.411, "reward": 2.0686384737491608, "reward_std": 0.21820732951164246, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9815848618745804, "step": 2726 }, { "clip_ratio": 0.0, "completion_length": 208.8102798461914, "epoch": 0.8145769546710477, "grad_norm": 2.566035509109497, "kl": 0.69482421875, "learning_rate": 2.019088301088962e-06, "loss": 0.3109, "reward": 2.1512277722358704, "reward_std": 0.21393011882901192, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9815848767757416, "step": 2727 }, { "clip_ratio": 0.0, "completion_length": 215.99777221679688, "epoch": 0.8148756627585692, "grad_norm": 1.7678449153900146, "kl": 0.8232421875, "learning_rate": 2.0128080322464437e-06, "loss": 0.5302, "reward": 2.0106027722358704, "reward_std": 0.30175187066197395, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9659598767757416, "step": 2728 }, { "clip_ratio": 0.0, "completion_length": 210.57143783569336, "epoch": 0.8151743708460907, "grad_norm": 0.9521965980529785, "kl": 0.447265625, "learning_rate": 2.0065364526859576e-06, "loss": 0.3935, "reward": 1.978236734867096, "reward_std": 0.28553155809640884, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9737723618745804, "step": 2729 }, { "clip_ratio": 0.0, "completion_length": 231.94867324829102, "epoch": 0.8154730789336121, "grad_norm": 2.567028760910034, "kl": 0.77294921875, "learning_rate": 2.00027356923037e-06, "loss": 0.3202, "reward": 2.0357143580913544, "reward_std": 0.1793438084423542, "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9799107611179352, "step": 2730 }, { "clip_ratio": 0.0, "completion_length": 216.2678680419922, "epoch": 0.8157717870211336, "grad_norm": 0.8347272872924805, "kl": 0.50830078125, "learning_rate": 1.9940193886930783e-06, "loss": 0.3477, "reward": 1.9748884737491608, "reward_std": 0.19476269744336605, "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.977120578289032, "step": 2731 }, { "clip_ratio": 0.0, "completion_length": 205.24108123779297, "epoch": 0.816070495108655, "grad_norm": 1.9742002487182617, "kl": 0.333984375, "learning_rate": 1.987773917878022e-06, "loss": 0.2041, "reward": 2.030134081840515, "reward_std": 0.16840876638889313, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.985491082072258, "step": 2732 }, { "clip_ratio": 0.0, "completion_length": 226.70537185668945, "epoch": 0.8163692031961766, "grad_norm": 1.341564655303955, "kl": 0.6806640625, "learning_rate": 1.981537163579663e-06, "loss": 0.4587, "reward": 2.0172992050647736, "reward_std": 0.2984892167150974, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.9531250447034836, "rewards/tag_count_reward": 0.968191996216774, "step": 2733 }, { "clip_ratio": 0.0, "completion_length": 206.5223274230957, "epoch": 0.816667911283698, "grad_norm": 3.5767288208007812, "kl": 0.611083984375, "learning_rate": 1.975309132582973e-06, "loss": 0.3571, "reward": 2.028459906578064, "reward_std": 0.2315771710127592, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.977120578289032, "step": 2734 }, { "clip_ratio": 0.0, "completion_length": 221.04465103149414, "epoch": 0.8169666193712195, "grad_norm": 1.0255842208862305, "kl": 0.615234375, "learning_rate": 1.969089831663443e-06, "loss": 0.2239, "reward": 2.0033482909202576, "reward_std": 0.1972288191318512, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9787946790456772, "step": 2735 }, { "clip_ratio": 0.0, "completion_length": 202.7410774230957, "epoch": 0.8172653274587409, "grad_norm": 0.7433746457099915, "kl": 0.317138671875, "learning_rate": 1.9628792675870656e-06, "loss": 0.2123, "reward": 2.02678582072258, "reward_std": 0.14543528482317924, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9866071790456772, "step": 2736 }, { "clip_ratio": 0.0, "completion_length": 212.41519165039062, "epoch": 0.8175640355462624, "grad_norm": 0.5945969223976135, "kl": 0.377685546875, "learning_rate": 1.95667744711032e-06, "loss": 0.2154, "reward": 2.0078125596046448, "reward_std": 0.26119206845760345, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9765625298023224, "step": 2737 }, { "clip_ratio": 0.0, "completion_length": 212.6897430419922, "epoch": 0.8178627436337839, "grad_norm": 1.1889562606811523, "kl": 0.434814453125, "learning_rate": 1.950484376980183e-06, "loss": 0.2561, "reward": 2.025669753551483, "reward_std": 0.21179202944040298, "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9854911118745804, "step": 2738 }, { "clip_ratio": 0.0, "completion_length": 200.4263458251953, "epoch": 0.8181614517213054, "grad_norm": 1.988491177558899, "kl": 0.28466796875, "learning_rate": 1.9443000639341046e-06, "loss": 0.098, "reward": 2.0887278020381927, "reward_std": 0.12665259931236506, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9927455633878708, "step": 2739 }, { "clip_ratio": 0.0, "completion_length": 190.16741943359375, "epoch": 0.8184601598088268, "grad_norm": 0.6183285117149353, "kl": 0.2783203125, "learning_rate": 1.9381245147000138e-06, "loss": 0.2148, "reward": 2.0318081080913544, "reward_std": 0.1485206689685583, "rewards/accuracy_reward": 0.06026785960420966, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973469734192, "step": 2740 }, { "clip_ratio": 0.0, "completion_length": 214.5468864440918, "epoch": 0.8187588678963483, "grad_norm": 1.2100903987884521, "kl": 0.57421875, "learning_rate": 1.931957735996304e-06, "loss": 0.2864, "reward": 2.1026786267757416, "reward_std": 0.2071908712387085, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.979910746216774, "step": 2741 }, { "clip_ratio": 0.0, "completion_length": 195.8125114440918, "epoch": 0.8190575759838697, "grad_norm": 0.27645084261894226, "kl": 0.212158203125, "learning_rate": 1.9257997345318223e-06, "loss": 0.1336, "reward": 2.095424175262451, "reward_std": 0.1763601042330265, "rewards/accuracy_reward": 0.12276786053553224, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9905134290456772, "step": 2742 }, { "clip_ratio": 0.0, "completion_length": 187.11384963989258, "epoch": 0.8193562840713913, "grad_norm": 0.41847744584083557, "kl": 0.279541015625, "learning_rate": 1.919650517005872e-06, "loss": 0.1852, "reward": 2.1004465222358704, "reward_std": 0.17353864759206772, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.988839328289032, "step": 2743 }, { "clip_ratio": 0.0, "completion_length": 194.99108123779297, "epoch": 0.8196549921589127, "grad_norm": 0.37866145372390747, "kl": 0.210205078125, "learning_rate": 1.9135100901082025e-06, "loss": 0.2559, "reward": 2.0295759737491608, "reward_std": 0.16175233200192451, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.984933078289032, "step": 2744 }, { "clip_ratio": 0.0, "completion_length": 198.33036422729492, "epoch": 0.8199537002464342, "grad_norm": 0.34204041957855225, "kl": 0.254638671875, "learning_rate": 1.9073784605189914e-06, "loss": 0.2697, "reward": 2.025111675262451, "reward_std": 0.22104576602578163, "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9827009290456772, "step": 2745 }, { "clip_ratio": 0.0, "completion_length": 187.11384963989258, "epoch": 0.8202524083339556, "grad_norm": 0.8801470994949341, "kl": 0.269287109375, "learning_rate": 1.901255634908854e-06, "loss": 0.2781, "reward": 2.0876117050647736, "reward_std": 0.23254350945353508, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.984933078289032, "step": 2746 }, { "clip_ratio": 0.0, "completion_length": 206.3727798461914, "epoch": 0.8205511164214772, "grad_norm": 0.3645852506160736, "kl": 0.25390625, "learning_rate": 1.895141619938825e-06, "loss": 0.2223, "reward": 2.0602679550647736, "reward_std": 0.2043006718158722, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9866071790456772, "step": 2747 }, { "clip_ratio": 0.0, "completion_length": 204.11831283569336, "epoch": 0.8208498245089986, "grad_norm": 1.7185307741165161, "kl": 0.319580078125, "learning_rate": 1.8890364222603496e-06, "loss": 0.1671, "reward": 2.026227742433548, "reward_std": 0.14050180278718472, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9882812798023224, "step": 2748 }, { "clip_ratio": 0.0, "completion_length": 232.11608505249023, "epoch": 0.8211485325965201, "grad_norm": 0.9013047218322754, "kl": 0.3447265625, "learning_rate": 1.8829400485152872e-06, "loss": 0.2785, "reward": 1.9921875596046448, "reward_std": 0.24239857494831085, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9787946790456772, "step": 2749 }, { "clip_ratio": 0.0, "completion_length": 222.40848922729492, "epoch": 0.8214472406840415, "grad_norm": 0.5561918020248413, "kl": 0.514404296875, "learning_rate": 1.8768525053358976e-06, "loss": 0.1975, "reward": 2.025669753551483, "reward_std": 0.21613361686468124, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9787946939468384, "step": 2750 }, { "clip_ratio": 0.0, "completion_length": 223.39063262939453, "epoch": 0.821745948771563, "grad_norm": 0.6829707622528076, "kl": 0.355712890625, "learning_rate": 1.8707737993448249e-06, "loss": 0.1774, "reward": 2.051897406578064, "reward_std": 0.15340609475970268, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9893973618745804, "step": 2751 }, { "clip_ratio": 0.0, "completion_length": 235.94420623779297, "epoch": 0.8220446568590845, "grad_norm": 1.968773603439331, "kl": 0.688232421875, "learning_rate": 1.8647039371551124e-06, "loss": 0.2827, "reward": 2.0033483505249023, "reward_std": 0.20786288753151894, "rewards/accuracy_reward": 0.051339287078008056, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.981026828289032, "step": 2752 }, { "clip_ratio": 0.0, "completion_length": 220.4888458251953, "epoch": 0.822343364946606, "grad_norm": 0.1948918104171753, "kl": 0.20068359375, "learning_rate": 1.8586429253701676e-06, "loss": 0.0783, "reward": 2.0301340222358704, "reward_std": 0.08221010863780975, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9921875149011612, "step": 2753 }, { "clip_ratio": 0.0, "completion_length": 237.23661422729492, "epoch": 0.8226420730341274, "grad_norm": 0.5454941987991333, "kl": 0.35498046875, "learning_rate": 1.852590770583782e-06, "loss": 0.0656, "reward": 2.0463170409202576, "reward_std": 0.1404966004192829, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9949776977300644, "step": 2754 }, { "clip_ratio": 0.0, "completion_length": 232.91296005249023, "epoch": 0.8229407811216489, "grad_norm": 0.8012772798538208, "kl": 0.42529296875, "learning_rate": 1.8465474793801086e-06, "loss": 0.3195, "reward": 2.105468839406967, "reward_std": 0.2802036665380001, "rewards/accuracy_reward": 0.15625000861473382, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9782366454601288, "step": 2755 }, { "clip_ratio": 0.0, "completion_length": 245.43081665039062, "epoch": 0.8232394892091703, "grad_norm": 0.702181339263916, "kl": 0.2138671875, "learning_rate": 1.8405130583336507e-06, "loss": 0.1677, "reward": 2.0357143878936768, "reward_std": 0.20126548781991005, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.986607164144516, "step": 2756 }, { "clip_ratio": 0.0, "completion_length": 264.6585006713867, "epoch": 0.8235381972966918, "grad_norm": 0.4602053761482239, "kl": 0.415283203125, "learning_rate": 1.8344875140092689e-06, "loss": 0.3316, "reward": 2.011718899011612, "reward_std": 0.2670277766883373, "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.976004496216774, "step": 2757 }, { "clip_ratio": 0.0, "completion_length": 243.4776954650879, "epoch": 0.8238369053842133, "grad_norm": 4.060537815093994, "kl": 0.70361328125, "learning_rate": 1.8284708529621687e-06, "loss": 0.3017, "reward": 2.0256697237491608, "reward_std": 0.2248808853328228, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9810268133878708, "step": 2758 }, { "clip_ratio": 0.0, "completion_length": 267.3571586608887, "epoch": 0.8241356134717347, "grad_norm": 0.9226413369178772, "kl": 0.464599609375, "learning_rate": 1.822463081737883e-06, "loss": 0.2645, "reward": 2.0200893580913544, "reward_std": 0.21698067337274551, "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.979910746216774, "step": 2759 }, { "clip_ratio": 0.0, "completion_length": 249.1339454650879, "epoch": 0.8244343215592562, "grad_norm": 1.9314242601394653, "kl": 0.4736328125, "learning_rate": 1.8164642068722782e-06, "loss": 0.2069, "reward": 2.127790242433548, "reward_std": 0.26848775148391724, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9804687947034836, "step": 2760 }, { "clip_ratio": 0.0, "completion_length": 261.0959930419922, "epoch": 0.8247330296467776, "grad_norm": 1.0634512901306152, "kl": 0.378173828125, "learning_rate": 1.810474234891547e-06, "loss": 0.216, "reward": 2.051897406578064, "reward_std": 0.2285018190741539, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9827009290456772, "step": 2761 }, { "clip_ratio": 0.0, "completion_length": 259.5602798461914, "epoch": 0.8250317377342992, "grad_norm": 2.0903098583221436, "kl": 0.654052734375, "learning_rate": 1.8044931723121861e-06, "loss": 0.2967, "reward": 2.04241082072258, "reward_std": 0.21602439507842064, "rewards/accuracy_reward": 0.09821428847499192, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9754464775323868, "step": 2762 }, { "clip_ratio": 0.0, "completion_length": 274.1986846923828, "epoch": 0.8253304458218206, "grad_norm": 2.106248378753662, "kl": 0.51513671875, "learning_rate": 1.798521025641009e-06, "loss": 0.2964, "reward": 1.977678656578064, "reward_std": 0.2447325261309743, "rewards/accuracy_reward": 0.03125000232830644, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.979910746216774, "step": 2763 }, { "clip_ratio": 0.0, "completion_length": 297.6428680419922, "epoch": 0.8256291539093421, "grad_norm": 1.7228339910507202, "kl": 0.604736328125, "learning_rate": 1.7925578013751233e-06, "loss": 0.2507, "reward": 2.0295759737491608, "reward_std": 0.20926208049058914, "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.976004496216774, "step": 2764 }, { "clip_ratio": 0.0, "completion_length": 287.38170623779297, "epoch": 0.8259278619968635, "grad_norm": 0.44958433508872986, "kl": 0.34912109375, "learning_rate": 1.7866035060019338e-06, "loss": 0.2565, "reward": 2.029576003551483, "reward_std": 0.21973590925335884, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9827009290456772, "step": 2765 }, { "clip_ratio": 0.0, "completion_length": 253.13170623779297, "epoch": 0.826226570084385, "grad_norm": 0.9827542901039124, "kl": 0.8662109375, "learning_rate": 1.7806581459991324e-06, "loss": 0.3088, "reward": 2.014508992433548, "reward_std": 0.2570101246237755, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9787946939468384, "step": 2766 }, { "clip_ratio": 0.0, "completion_length": 293.54688262939453, "epoch": 0.8265252781719065, "grad_norm": 3.647418260574341, "kl": 0.610107421875, "learning_rate": 1.774721727834684e-06, "loss": 0.2174, "reward": 1.97209832072258, "reward_std": 0.24599283933639526, "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9765625298023224, "step": 2767 }, { "clip_ratio": 0.0, "completion_length": 274.8504638671875, "epoch": 0.826823986259428, "grad_norm": 0.47539472579956055, "kl": 0.416015625, "learning_rate": 1.7687942579668315e-06, "loss": 0.1678, "reward": 1.9776786863803864, "reward_std": 0.17681506276130676, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9843750447034836, "step": 2768 }, { "clip_ratio": 0.0, "completion_length": 283.2678756713867, "epoch": 0.8271226943469494, "grad_norm": 2.644153118133545, "kl": 0.64453125, "learning_rate": 1.7628757428440846e-06, "loss": 0.1639, "reward": 2.2689732909202576, "reward_std": 0.21922863274812698, "rewards/accuracy_reward": 0.301339291036129, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9854911118745804, "step": 2769 }, { "clip_ratio": 0.0, "completion_length": 281.1272506713867, "epoch": 0.8274214024344709, "grad_norm": 1.4880603551864624, "kl": 0.6416015625, "learning_rate": 1.7569661889052015e-06, "loss": 0.4155, "reward": 2.0156250596046448, "reward_std": 0.2919059097766876, "rewards/accuracy_reward": 0.08928571688011289, "rewards/format_reward": 0.9575893133878708, "rewards/tag_count_reward": 0.9687500298023224, "step": 2770 }, { "clip_ratio": 0.0, "completion_length": 271.2790298461914, "epoch": 0.8277201105219923, "grad_norm": 1.3083817958831787, "kl": 0.544921875, "learning_rate": 1.7510656025792005e-06, "loss": 0.28, "reward": 2.0401786267757416, "reward_std": 0.23029758408665657, "rewards/accuracy_reward": 0.08705357881262898, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9799107760190964, "step": 2771 }, { "clip_ratio": 0.0, "completion_length": 279.2991142272949, "epoch": 0.8280188186095139, "grad_norm": 2.1789684295654297, "kl": 0.541015625, "learning_rate": 1.7451739902853448e-06, "loss": 0.2917, "reward": 1.9983260035514832, "reward_std": 0.2455492876470089, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9782366454601288, "step": 2772 }, { "clip_ratio": 0.0, "completion_length": 300.13170623779297, "epoch": 0.8283175266970353, "grad_norm": 1.8859293460845947, "kl": 0.814453125, "learning_rate": 1.739291358433124e-06, "loss": 0.3809, "reward": 2.075334906578064, "reward_std": 0.33235054463148117, "rewards/accuracy_reward": 0.14732143888249993, "rewards/format_reward": 0.9575893133878708, "rewards/tag_count_reward": 0.9704241454601288, "step": 2773 }, { "clip_ratio": 0.0, "completion_length": 281.66295623779297, "epoch": 0.8286162347845568, "grad_norm": 1.4272562265396118, "kl": 0.5927734375, "learning_rate": 1.7334177134222696e-06, "loss": 0.2389, "reward": 2.1093751192092896, "reward_std": 0.31120385229587555, "rewards/accuracy_reward": 0.16294643469154835, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.979910746216774, "step": 2774 }, { "clip_ratio": 0.0, "completion_length": 299.2567138671875, "epoch": 0.8289149428720782, "grad_norm": 0.746364176273346, "kl": 0.537109375, "learning_rate": 1.7275530616427338e-06, "loss": 0.166, "reward": 2.0764509439468384, "reward_std": 0.22466200776398182, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.982700914144516, "step": 2775 }, { "clip_ratio": 0.0, "completion_length": 285.98661041259766, "epoch": 0.8292136509595998, "grad_norm": 1.5080773830413818, "kl": 0.87890625, "learning_rate": 1.7216974094746764e-06, "loss": 0.4445, "reward": 2.0820313692092896, "reward_std": 0.3113076463341713, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.9464286118745804, "rewards/tag_count_reward": 0.9659598767757416, "step": 2776 }, { "clip_ratio": 0.0, "completion_length": 306.42188262939453, "epoch": 0.8295123590471212, "grad_norm": 1.7209219932556152, "kl": 0.58935546875, "learning_rate": 1.7158507632884801e-06, "loss": 0.334, "reward": 2.0156250596046448, "reward_std": 0.2888224348425865, "rewards/accuracy_reward": 0.08705357741564512, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9709821939468384, "step": 2777 }, { "clip_ratio": 0.0, "completion_length": 273.50001525878906, "epoch": 0.8298110671346427, "grad_norm": 1.0556267499923706, "kl": 0.578125, "learning_rate": 1.7100131294447165e-06, "loss": 0.3622, "reward": 2.139509081840515, "reward_std": 0.28544751182198524, "rewards/accuracy_reward": 0.1941964328289032, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.9787946939468384, "step": 2778 }, { "clip_ratio": 0.0, "completion_length": 298.6183204650879, "epoch": 0.8301097752221641, "grad_norm": 3.407560110092163, "kl": 0.90234375, "learning_rate": 1.7041845142941615e-06, "loss": 0.3387, "reward": 1.9453125894069672, "reward_std": 0.28422707319259644, "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9631696790456772, "step": 2779 }, { "clip_ratio": 0.0, "completion_length": 281.06920623779297, "epoch": 0.8304084833096856, "grad_norm": 1.2304884195327759, "kl": 0.66455078125, "learning_rate": 1.6983649241777811e-06, "loss": 0.2987, "reward": 1.9866072237491608, "reward_std": 0.24954677745699883, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.9754464626312256, "step": 2780 }, { "clip_ratio": 0.0, "completion_length": 289.39957427978516, "epoch": 0.8307071913972071, "grad_norm": 14.224886894226074, "kl": 1.4775390625, "learning_rate": 1.692554365426713e-06, "loss": 0.6062, "reward": 1.9642857909202576, "reward_std": 0.39725133031606674, "rewards/accuracy_reward": 0.10937500838190317, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.9397321939468384, "step": 2781 }, { "clip_ratio": 0.0, "completion_length": 245.55804061889648, "epoch": 0.8310058994847286, "grad_norm": 1.99446702003479, "kl": 0.513671875, "learning_rate": 1.6867528443622772e-06, "loss": 0.2703, "reward": 2.02287957072258, "reward_std": 0.1755457241088152, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9849330633878708, "step": 2782 }, { "clip_ratio": 0.0, "completion_length": 249.8504638671875, "epoch": 0.83130460757225, "grad_norm": 0.971929669380188, "kl": 0.5166015625, "learning_rate": 1.6809603672959618e-06, "loss": 0.3085, "reward": 2.0904018580913544, "reward_std": 0.24538848921656609, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9832589775323868, "step": 2783 }, { "clip_ratio": 0.0, "completion_length": 266.7567024230957, "epoch": 0.8316033156597715, "grad_norm": 1.4628326892852783, "kl": 0.40625, "learning_rate": 1.6751769405294128e-06, "loss": 0.2367, "reward": 1.9860492050647736, "reward_std": 0.2355998083949089, "rewards/accuracy_reward": 0.037946428870782256, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9793527126312256, "step": 2784 }, { "clip_ratio": 0.0, "completion_length": 238.98885345458984, "epoch": 0.8319020237472929, "grad_norm": 0.4698634147644043, "kl": 0.3984375, "learning_rate": 1.6694025703544349e-06, "loss": 0.3017, "reward": 2.0786831378936768, "reward_std": 0.2795373536646366, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.9575893431901932, "rewards/tag_count_reward": 0.969308078289032, "step": 2785 }, { "clip_ratio": 0.0, "completion_length": 246.0334930419922, "epoch": 0.8322007318348145, "grad_norm": 1.036810040473938, "kl": 0.44140625, "learning_rate": 1.6636372630529718e-06, "loss": 0.2174, "reward": 2.0864956378936768, "reward_std": 0.28445882350206375, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9793527275323868, "step": 2786 }, { "clip_ratio": 0.0, "completion_length": 236.38840103149414, "epoch": 0.8324994399223359, "grad_norm": 2.6757192611694336, "kl": 0.610107421875, "learning_rate": 1.6578810248971144e-06, "loss": 0.4136, "reward": 1.987165242433548, "reward_std": 0.18047094903886318, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9782366454601288, "step": 2787 }, { "clip_ratio": 0.0, "completion_length": 227.1227798461914, "epoch": 0.8327981480098574, "grad_norm": 1.249306082725525, "kl": 0.5244140625, "learning_rate": 1.652133862149089e-06, "loss": 0.2596, "reward": 2.032366156578064, "reward_std": 0.15287749841809273, "rewards/accuracy_reward": 0.07142857694998384, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9832589626312256, "step": 2788 }, { "clip_ratio": 0.0, "completion_length": 223.6227798461914, "epoch": 0.8330968560973788, "grad_norm": 0.8600038886070251, "kl": 0.43701171875, "learning_rate": 1.6463957810612408e-06, "loss": 0.1107, "reward": 2.0424108505249023, "reward_std": 0.18028896488249302, "rewards/accuracy_reward": 0.06919643376022577, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9910714477300644, "step": 2789 }, { "clip_ratio": 0.0, "completion_length": 240.86162185668945, "epoch": 0.8333955641849004, "grad_norm": 0.7041929960250854, "kl": 0.5068359375, "learning_rate": 1.6406667878760418e-06, "loss": 0.3212, "reward": 2.0719867050647736, "reward_std": 0.24283045157790184, "rewards/accuracy_reward": 0.13392857951112092, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9737723618745804, "step": 2790 }, { "clip_ratio": 0.0, "completion_length": 246.6629638671875, "epoch": 0.8336942722724218, "grad_norm": 0.7160000801086426, "kl": 0.344482421875, "learning_rate": 1.6349468888260766e-06, "loss": 0.2057, "reward": 2.0753349661827087, "reward_std": 0.16599592193961143, "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9815848618745804, "step": 2791 }, { "clip_ratio": 0.0, "completion_length": 240.61608123779297, "epoch": 0.8339929803599433, "grad_norm": 1.892096757888794, "kl": 0.59521484375, "learning_rate": 1.629236090134031e-06, "loss": 0.2731, "reward": 2.029576003551483, "reward_std": 0.2121189385652542, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 2792 }, { "clip_ratio": 0.0, "completion_length": 232.7366180419922, "epoch": 0.8342916884474647, "grad_norm": 1.913203477859497, "kl": 0.505126953125, "learning_rate": 1.6235343980126973e-06, "loss": 0.2196, "reward": 2.0368304550647736, "reward_std": 0.1617984026670456, "rewards/accuracy_reward": 0.07589286426082253, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9832589626312256, "step": 2793 }, { "clip_ratio": 0.0, "completion_length": 221.9687614440918, "epoch": 0.8345903965349862, "grad_norm": 1.4511016607284546, "kl": 0.419921875, "learning_rate": 1.617841818664957e-06, "loss": 0.1932, "reward": 2.054687589406967, "reward_std": 0.14779189601540565, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.9832589477300644, "step": 2794 }, { "clip_ratio": 0.0, "completion_length": 204.63393783569336, "epoch": 0.8348891046225076, "grad_norm": 0.7191034555435181, "kl": 0.343994140625, "learning_rate": 1.6121583582837773e-06, "loss": 0.159, "reward": 2.1238840222358704, "reward_std": 0.1781312245875597, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9899553805589676, "step": 2795 }, { "clip_ratio": 0.0, "completion_length": 245.80804443359375, "epoch": 0.8351878127100292, "grad_norm": 0.548456072807312, "kl": 0.32666015625, "learning_rate": 1.6064840230522094e-06, "loss": 0.174, "reward": 2.0184152722358704, "reward_std": 0.1387758795171976, "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9871652275323868, "step": 2796 }, { "clip_ratio": 0.0, "completion_length": 226.4107208251953, "epoch": 0.8354865207975506, "grad_norm": 0.4567229449748993, "kl": 0.304931640625, "learning_rate": 1.6008188191433683e-06, "loss": 0.1516, "reward": 2.068638503551483, "reward_std": 0.23513518646359444, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9860491454601288, "step": 2797 }, { "clip_ratio": 0.0, "completion_length": 237.8303680419922, "epoch": 0.8357852288850721, "grad_norm": 2.129695177078247, "kl": 0.64599609375, "learning_rate": 1.5951627527204438e-06, "loss": 0.3596, "reward": 2.0502233505249023, "reward_std": 0.2653493359684944, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9765625447034836, "step": 2798 }, { "clip_ratio": 0.0, "completion_length": 253.91072463989258, "epoch": 0.8360839369725935, "grad_norm": 2.920947551727295, "kl": 0.4521484375, "learning_rate": 1.589515829936684e-06, "loss": 0.322, "reward": 2.154017984867096, "reward_std": 0.190768925473094, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9843750447034836, "step": 2799 }, { "clip_ratio": 0.0, "completion_length": 236.0000114440918, "epoch": 0.836382645060115, "grad_norm": 0.8431496024131775, "kl": 0.306640625, "learning_rate": 1.583878056935384e-06, "loss": 0.2555, "reward": 2.0290179550647736, "reward_std": 0.23189819604158401, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.98214291036129, "step": 2800 }, { "clip_ratio": 0.0, "completion_length": 251.31920623779297, "epoch": 0.8366813531476365, "grad_norm": 0.8461250066757202, "kl": 0.35205078125, "learning_rate": 1.5782494398498882e-06, "loss": 0.2673, "reward": 2.1875001192092896, "reward_std": 0.21570739150047302, "rewards/accuracy_reward": 0.2433035857975483, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.9776786118745804, "step": 2801 }, { "clip_ratio": 0.0, "completion_length": 257.75224685668945, "epoch": 0.8369800612351579, "grad_norm": 11.052626609802246, "kl": 0.392333984375, "learning_rate": 1.5726299848035843e-06, "loss": 0.2411, "reward": 2.1138393580913544, "reward_std": 0.2408856637775898, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9799107611179352, "step": 2802 }, { "clip_ratio": 0.0, "completion_length": 270.0915298461914, "epoch": 0.8372787693226794, "grad_norm": 0.5064923167228699, "kl": 0.4462890625, "learning_rate": 1.567019697909884e-06, "loss": 0.2678, "reward": 2.060267925262451, "reward_std": 0.21120036020874977, "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9776785969734192, "step": 2803 }, { "clip_ratio": 0.0, "completion_length": 239.63617324829102, "epoch": 0.8375774774102008, "grad_norm": 0.5673531293869019, "kl": 0.36328125, "learning_rate": 1.5614185852722308e-06, "loss": 0.2406, "reward": 2.051339417695999, "reward_std": 0.2426428571343422, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9776786267757416, "step": 2804 }, { "clip_ratio": 0.0, "completion_length": 224.57590103149414, "epoch": 0.8378761854977224, "grad_norm": 1.7019249200820923, "kl": 0.6455078125, "learning_rate": 1.5558266529840893e-06, "loss": 0.3192, "reward": 2.080357253551483, "reward_std": 0.193428672850132, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9843750447034836, "step": 2805 }, { "clip_ratio": 0.0, "completion_length": 239.8281364440918, "epoch": 0.8381748935852438, "grad_norm": 0.8597730994224548, "kl": 0.4873046875, "learning_rate": 1.55024390712893e-06, "loss": 0.309, "reward": 2.0814732909202576, "reward_std": 0.21313868463039398, "rewards/accuracy_reward": 0.13169643469154835, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9787946939468384, "step": 2806 }, { "clip_ratio": 0.0, "completion_length": 210.88616943359375, "epoch": 0.8384736016727653, "grad_norm": 2.1685714721679688, "kl": 0.54931640625, "learning_rate": 1.5446703537802344e-06, "loss": 0.2354, "reward": 2.102120578289032, "reward_std": 0.16292216628789902, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812798023224, "step": 2807 }, { "clip_ratio": 0.0, "completion_length": 250.0535888671875, "epoch": 0.8387723097602867, "grad_norm": 0.6023011803627014, "kl": 0.33984375, "learning_rate": 1.5391059990014834e-06, "loss": 0.1625, "reward": 2.1082590222358704, "reward_std": 0.21662207692861557, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9854911118745804, "step": 2808 }, { "clip_ratio": 0.0, "completion_length": 246.89733123779297, "epoch": 0.8390710178478082, "grad_norm": 1.6291648149490356, "kl": 0.38427734375, "learning_rate": 1.533550848846148e-06, "loss": 0.1138, "reward": 2.068080484867096, "reward_std": 0.17016959190368652, "rewards/accuracy_reward": 0.09151786100119352, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553954601288, "step": 2809 }, { "clip_ratio": 0.0, "completion_length": 238.16742324829102, "epoch": 0.8393697259353297, "grad_norm": 0.45624691247940063, "kl": 0.318603515625, "learning_rate": 1.5280049093576899e-06, "loss": 0.2185, "reward": 2.1339287161827087, "reward_std": 0.22813229262828827, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9821428805589676, "step": 2810 }, { "clip_ratio": 0.0, "completion_length": 263.6808166503906, "epoch": 0.8396684340228512, "grad_norm": 0.2910226881504059, "kl": 0.27099609375, "learning_rate": 1.5224681865695422e-06, "loss": 0.1699, "reward": 2.031250149011612, "reward_std": 0.16638526320457458, "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9843750447034836, "step": 2811 }, { "clip_ratio": 0.0, "completion_length": 207.9977798461914, "epoch": 0.8399671421103726, "grad_norm": 0.8008852601051331, "kl": 0.373046875, "learning_rate": 1.5169406865051218e-06, "loss": 0.2778, "reward": 2.086495667695999, "reward_std": 0.1345471255481243, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.990513414144516, "step": 2812 }, { "clip_ratio": 0.0, "completion_length": 233.56697463989258, "epoch": 0.8402658501978941, "grad_norm": 1.462870717048645, "kl": 0.43017578125, "learning_rate": 1.5114224151778068e-06, "loss": 0.2251, "reward": 2.0452009439468384, "reward_std": 0.20425788313150406, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652275323868, "step": 2813 }, { "clip_ratio": 0.0, "completion_length": 237.04241943359375, "epoch": 0.8405645582854155, "grad_norm": 1.3577818870544434, "kl": 0.64697265625, "learning_rate": 1.505913378590932e-06, "loss": 0.2373, "reward": 2.0574777722358704, "reward_std": 0.26753170415759087, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.977120578289032, "step": 2814 }, { "clip_ratio": 0.0, "completion_length": 242.39286041259766, "epoch": 0.8408632663729371, "grad_norm": 1.666453242301941, "kl": 0.4130859375, "learning_rate": 1.5004135827377909e-06, "loss": 0.193, "reward": 2.0150670409202576, "reward_std": 0.1805722862482071, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9882812649011612, "step": 2815 }, { "clip_ratio": 0.0, "completion_length": 212.36384963989258, "epoch": 0.8411619744604585, "grad_norm": 0.2708835005760193, "kl": 0.18017578125, "learning_rate": 1.4949230336016251e-06, "loss": 0.1127, "reward": 2.2114956378936768, "reward_std": 0.1712376121431589, "rewards/accuracy_reward": 0.2276785783469677, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 2816 }, { "clip_ratio": 0.0, "completion_length": 218.6741180419922, "epoch": 0.84146068254798, "grad_norm": 1.193528413772583, "kl": 0.6201171875, "learning_rate": 1.489441737155609e-06, "loss": 0.1982, "reward": 2.0898438096046448, "reward_std": 0.14110724069178104, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9849330931901932, "step": 2817 }, { "clip_ratio": 0.0, "completion_length": 245.5000114440918, "epoch": 0.8417593906355014, "grad_norm": 0.47214800119400024, "kl": 0.233642578125, "learning_rate": 1.4839696993628594e-06, "loss": 0.119, "reward": 2.036830425262451, "reward_std": 0.15391447115689516, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875149011612, "step": 2818 }, { "clip_ratio": 0.0, "completion_length": 230.43081283569336, "epoch": 0.842058098723023, "grad_norm": 0.5980554223060608, "kl": 0.250244140625, "learning_rate": 1.4785069261764184e-06, "loss": 0.1788, "reward": 2.08928582072258, "reward_std": 0.19945116341114044, "rewards/accuracy_reward": 0.11830357881262898, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9866071790456772, "step": 2819 }, { "clip_ratio": 0.0, "completion_length": 235.60045623779297, "epoch": 0.8423568068105444, "grad_norm": 1.6110479831695557, "kl": 0.281005859375, "learning_rate": 1.4730534235392435e-06, "loss": 0.1849, "reward": 1.9866072237491608, "reward_std": 0.14552469365298748, "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.988839328289032, "step": 2820 }, { "clip_ratio": 0.0, "completion_length": 226.7834930419922, "epoch": 0.8426555148980659, "grad_norm": 0.3805786669254303, "kl": 0.31298828125, "learning_rate": 1.4676091973842122e-06, "loss": 0.1434, "reward": 2.083705425262451, "reward_std": 0.14977047964930534, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9899553954601288, "step": 2821 }, { "clip_ratio": 0.0, "completion_length": 233.39509963989258, "epoch": 0.8429542229855873, "grad_norm": 0.6466024518013, "kl": 0.3896484375, "learning_rate": 1.4621742536341133e-06, "loss": 0.372, "reward": 2.041852742433548, "reward_std": 0.25771138072013855, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.9793527275323868, "step": 2822 }, { "clip_ratio": 0.0, "completion_length": 216.90625762939453, "epoch": 0.8432529310731088, "grad_norm": 0.48398557305336, "kl": 0.433349609375, "learning_rate": 1.456748598201626e-06, "loss": 0.1303, "reward": 2.176339328289032, "reward_std": 0.23858597502112389, "rewards/accuracy_reward": 0.2165178693830967, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9821428805589676, "step": 2823 }, { "clip_ratio": 0.0, "completion_length": 245.41518783569336, "epoch": 0.8435516391606303, "grad_norm": 0.5021827220916748, "kl": 0.454833984375, "learning_rate": 1.451332236989339e-06, "loss": 0.2795, "reward": 2.075334846973419, "reward_std": 0.2317543961107731, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.977120578289032, "step": 2824 }, { "clip_ratio": 0.0, "completion_length": 244.24331665039062, "epoch": 0.8438503472481518, "grad_norm": 0.5505797266960144, "kl": 0.4423828125, "learning_rate": 1.4459251758897153e-06, "loss": 0.2563, "reward": 2.098772406578064, "reward_std": 0.22317923977971077, "rewards/accuracy_reward": 0.1540178693830967, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.976004496216774, "step": 2825 }, { "clip_ratio": 0.0, "completion_length": 233.56920623779297, "epoch": 0.8441490553356732, "grad_norm": 0.4825923442840576, "kl": 0.292236328125, "learning_rate": 1.4405274207851116e-06, "loss": 0.1845, "reward": 2.031250089406967, "reward_std": 0.17577189020812511, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.98214291036129, "step": 2826 }, { "clip_ratio": 0.0, "completion_length": 285.92635345458984, "epoch": 0.8444477634231947, "grad_norm": 0.648684561252594, "kl": 0.4111328125, "learning_rate": 1.4351389775477576e-06, "loss": 0.2205, "reward": 2.072544753551483, "reward_std": 0.2724580354988575, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.9765625596046448, "step": 2827 }, { "clip_ratio": 0.0, "completion_length": 253.34376525878906, "epoch": 0.8447464715107161, "grad_norm": 0.5226352214813232, "kl": 0.33642578125, "learning_rate": 1.4297598520397471e-06, "loss": 0.1784, "reward": 2.1551340222358704, "reward_std": 0.20790079329162836, "rewards/accuracy_reward": 0.18526786752045155, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.987723246216774, "step": 2828 }, { "clip_ratio": 0.0, "completion_length": 289.09822845458984, "epoch": 0.8450451795982377, "grad_norm": 0.3045816719532013, "kl": 0.285400390625, "learning_rate": 1.4243900501130437e-06, "loss": 0.1564, "reward": 2.080357253551483, "reward_std": 0.20071497932076454, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9843750298023224, "step": 2829 }, { "clip_ratio": 0.0, "completion_length": 231.7991180419922, "epoch": 0.8453438876857591, "grad_norm": 0.43018725514411926, "kl": 0.221923828125, "learning_rate": 1.4190295776094677e-06, "loss": 0.1762, "reward": 2.1428571939468384, "reward_std": 0.22269437089562416, "rewards/accuracy_reward": 0.16741071548312902, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.988839328289032, "step": 2830 }, { "clip_ratio": 0.0, "completion_length": 257.04465103149414, "epoch": 0.8456425957732806, "grad_norm": 1.3719884157180786, "kl": 0.3267822265625, "learning_rate": 1.413678440360684e-06, "loss": 0.1652, "reward": 2.0385045409202576, "reward_std": 0.1279393993318081, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973469734192, "step": 2831 }, { "clip_ratio": 0.0, "completion_length": 263.5290298461914, "epoch": 0.845941303860802, "grad_norm": 2.6783063411712646, "kl": 0.3955078125, "learning_rate": 1.4083366441882074e-06, "loss": 0.2975, "reward": 2.089285761117935, "reward_std": 0.2679591216146946, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.9620536267757416, "rewards/tag_count_reward": 0.9754464626312256, "step": 2832 }, { "clip_ratio": 0.0, "completion_length": 261.6004524230957, "epoch": 0.8462400119483235, "grad_norm": 1.6694159507751465, "kl": 0.58056640625, "learning_rate": 1.4030041949033902e-06, "loss": 0.3462, "reward": 2.095424234867096, "reward_std": 0.23579758405685425, "rewards/accuracy_reward": 0.14955357951112092, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.977120578289032, "step": 2833 }, { "clip_ratio": 0.0, "completion_length": 253.8995704650879, "epoch": 0.846538720035845, "grad_norm": 0.983506977558136, "kl": 0.3583984375, "learning_rate": 1.3976810983074107e-06, "loss": 0.2117, "reward": 2.0055805146694183, "reward_std": 0.2079500826075673, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9810268133878708, "step": 2834 }, { "clip_ratio": 0.0, "completion_length": 275.7723274230957, "epoch": 0.8468374281233665, "grad_norm": 0.525204598903656, "kl": 0.235595703125, "learning_rate": 1.392367360191278e-06, "loss": 0.1496, "reward": 2.0797992050647736, "reward_std": 0.17236362770199776, "rewards/accuracy_reward": 0.11383928824216127, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9860491305589676, "step": 2835 }, { "clip_ratio": 0.0, "completion_length": 272.9509048461914, "epoch": 0.8471361362108879, "grad_norm": 0.5133364796638489, "kl": 0.3046875, "learning_rate": 1.3870629863358221e-06, "loss": 0.1702, "reward": 2.0195313096046448, "reward_std": 0.16978085041046143, "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9882812947034836, "step": 2836 }, { "clip_ratio": 0.0, "completion_length": 281.35268783569336, "epoch": 0.8474348442984094, "grad_norm": 1.3605852127075195, "kl": 0.247314453125, "learning_rate": 1.3817679825116748e-06, "loss": 0.245, "reward": 2.066406399011612, "reward_std": 0.2909274324774742, "rewards/accuracy_reward": 0.12946429150179029, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9726562798023224, "step": 2837 }, { "clip_ratio": 0.0, "completion_length": 272.3906364440918, "epoch": 0.8477335523859308, "grad_norm": 12.906785011291504, "kl": 1.1923828125, "learning_rate": 1.3764823544792883e-06, "loss": 0.357, "reward": 2.098772406578064, "reward_std": 0.24722076207399368, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.984933078289032, "step": 2838 }, { "clip_ratio": 0.0, "completion_length": 279.99108505249023, "epoch": 0.8480322604734524, "grad_norm": 0.5416148900985718, "kl": 0.32666015625, "learning_rate": 1.3712061079889016e-06, "loss": 0.3222, "reward": 2.113839328289032, "reward_std": 0.33783771842718124, "rewards/accuracy_reward": 0.1674107164144516, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9776785969734192, "step": 2839 }, { "clip_ratio": 0.0, "completion_length": 275.6763496398926, "epoch": 0.8483309685609738, "grad_norm": 0.9002395272254944, "kl": 0.5234375, "learning_rate": 1.3659392487805567e-06, "loss": 0.1399, "reward": 1.9821429550647736, "reward_std": 0.23911994509398937, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9665178805589676, "step": 2840 }, { "clip_ratio": 0.0, "completion_length": 267.79466247558594, "epoch": 0.8486296766484953, "grad_norm": 1.591863751411438, "kl": 0.39404296875, "learning_rate": 1.3606817825840834e-06, "loss": 0.1986, "reward": 2.0931921005249023, "reward_std": 0.15055761393159628, "rewards/accuracy_reward": 0.12053571944124997, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9882812798023224, "step": 2841 }, { "clip_ratio": 0.0, "completion_length": 280.39064025878906, "epoch": 0.8489283847360167, "grad_norm": 0.6187447905540466, "kl": 0.39501953125, "learning_rate": 1.3554337151190833e-06, "loss": 0.1717, "reward": 2.094308078289032, "reward_std": 0.2070428691804409, "rewards/accuracy_reward": 0.13392857508733869, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9827009290456772, "step": 2842 }, { "clip_ratio": 0.0, "completion_length": 330.4799270629883, "epoch": 0.8492270928235381, "grad_norm": 0.45054444670677185, "kl": 0.281982421875, "learning_rate": 1.3501950520949436e-06, "loss": 0.206, "reward": 2.1344867050647736, "reward_std": 0.25820715352892876, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9804687798023224, "step": 2843 }, { "clip_ratio": 0.0, "completion_length": 291.7232208251953, "epoch": 0.8495258009110597, "grad_norm": 0.810238242149353, "kl": 0.498046875, "learning_rate": 1.3449657992108167e-06, "loss": 0.2479, "reward": 2.1227679550647736, "reward_std": 0.2693120501935482, "rewards/accuracy_reward": 0.18080357578583062, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9754464775323868, "step": 2844 }, { "clip_ratio": 0.0, "completion_length": 276.4129524230957, "epoch": 0.8498245089985811, "grad_norm": 1.4285345077514648, "kl": 0.4462890625, "learning_rate": 1.339745962155613e-06, "loss": 0.3237, "reward": 2.0580358505249023, "reward_std": 0.27294548600912094, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9776786118745804, "step": 2845 }, { "clip_ratio": 0.0, "completion_length": 300.8259048461914, "epoch": 0.8501232170861026, "grad_norm": 11.5316743850708, "kl": 0.46337890625, "learning_rate": 1.334535546608008e-06, "loss": 0.2884, "reward": 1.9827010035514832, "reward_std": 0.2994016297161579, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9670759439468384, "step": 2846 }, { "clip_ratio": 0.0, "completion_length": 298.1339416503906, "epoch": 0.850421925173624, "grad_norm": 0.7563071846961975, "kl": 0.343017578125, "learning_rate": 1.3293345582364225e-06, "loss": 0.1761, "reward": 1.9916295409202576, "reward_std": 0.2219349667429924, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.97823666036129, "step": 2847 }, { "clip_ratio": 0.0, "completion_length": 307.17635345458984, "epoch": 0.8507206332611456, "grad_norm": 1.1602576971054077, "kl": 0.4892578125, "learning_rate": 1.3241430026990187e-06, "loss": 0.2865, "reward": 2.000000089406967, "reward_std": 0.2245631441473961, "rewards/accuracy_reward": 0.07812500116415322, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9709821790456772, "step": 2848 }, { "clip_ratio": 0.0, "completion_length": 271.06251525878906, "epoch": 0.851019341348667, "grad_norm": 1.3810195922851562, "kl": 0.386962890625, "learning_rate": 1.3189608856437053e-06, "loss": 0.212, "reward": 2.1132813692092896, "reward_std": 0.20975584164261818, "rewards/accuracy_reward": 0.16294643748551607, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.9793526977300644, "step": 2849 }, { "clip_ratio": 0.0, "completion_length": 286.0625114440918, "epoch": 0.8513180494361885, "grad_norm": 0.7389421463012695, "kl": 0.588623046875, "learning_rate": 1.3137882127081126e-06, "loss": 0.3077, "reward": 2.066964417695999, "reward_std": 0.28058280050754547, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9575893431901932, "rewards/tag_count_reward": 0.973214328289032, "step": 2850 }, { "clip_ratio": 0.0, "completion_length": 295.9263458251953, "epoch": 0.8516167575237099, "grad_norm": 0.8475350737571716, "kl": 0.42822265625, "learning_rate": 1.3086249895196045e-06, "loss": 0.234, "reward": 2.116071492433548, "reward_std": 0.2544271405786276, "rewards/accuracy_reward": 0.1897321566939354, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.9709821939468384, "step": 2851 }, { "clip_ratio": 0.0, "completion_length": 255.69866943359375, "epoch": 0.8519154656112314, "grad_norm": 0.47251999378204346, "kl": 0.46533203125, "learning_rate": 1.3034712216952628e-06, "loss": 0.1642, "reward": 2.0262278020381927, "reward_std": 0.1998324654996395, "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9815848618745804, "step": 2852 }, { "clip_ratio": 0.0, "completion_length": 263.4308166503906, "epoch": 0.8522141736987529, "grad_norm": 0.30704858899116516, "kl": 0.2451171875, "learning_rate": 1.2983269148418797e-06, "loss": 0.1947, "reward": 2.0691965222358704, "reward_std": 0.18206141144037247, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9821428954601288, "step": 2853 }, { "clip_ratio": 0.0, "completion_length": 284.3303680419922, "epoch": 0.8525128817862744, "grad_norm": 1.043968677520752, "kl": 0.52734375, "learning_rate": 1.2931920745559566e-06, "loss": 0.3529, "reward": 2.020647406578064, "reward_std": 0.27932254411280155, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.9620535969734192, "rewards/tag_count_reward": 0.9715402126312256, "step": 2854 }, { "clip_ratio": 0.0, "completion_length": 289.7433166503906, "epoch": 0.8528115898737958, "grad_norm": 1.3937397003173828, "kl": 0.63232421875, "learning_rate": 1.2880667064237006e-06, "loss": 0.2812, "reward": 2.050781339406967, "reward_std": 0.2179020680487156, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9860491454601288, "step": 2855 }, { "clip_ratio": 0.0, "completion_length": 251.90403366088867, "epoch": 0.8531102979613173, "grad_norm": 0.9435926079750061, "kl": 0.357666015625, "learning_rate": 1.2829508160210036e-06, "loss": 0.2446, "reward": 2.1177456080913544, "reward_std": 0.15489982813596725, "rewards/accuracy_reward": 0.1428571455180645, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9905134290456772, "step": 2856 }, { "clip_ratio": 0.0, "completion_length": 271.4910888671875, "epoch": 0.8534090060488387, "grad_norm": 1.9343171119689941, "kl": 0.44140625, "learning_rate": 1.2778444089134567e-06, "loss": 0.1541, "reward": 2.0792411863803864, "reward_std": 0.18938759807497263, "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.9821428656578064, "rewards/tag_count_reward": 0.9877232313156128, "step": 2857 }, { "clip_ratio": 0.0, "completion_length": 260.83483123779297, "epoch": 0.8537077141363603, "grad_norm": 0.2952578365802765, "kl": 0.27978515625, "learning_rate": 1.272747490656332e-06, "loss": 0.0902, "reward": 2.0491072833538055, "reward_std": 0.14496364071965218, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.988839328289032, "step": 2858 }, { "clip_ratio": 0.0, "completion_length": 293.6227798461914, "epoch": 0.8540064222238817, "grad_norm": 0.29123103618621826, "kl": 0.242919921875, "learning_rate": 1.2676600667945715e-06, "loss": 0.1499, "reward": 2.0915179550647736, "reward_std": 0.1545933149755001, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9866071790456772, "step": 2859 }, { "clip_ratio": 0.0, "completion_length": 283.3683204650879, "epoch": 0.8543051303114032, "grad_norm": 4.116544246673584, "kl": 0.59130859375, "learning_rate": 1.2625821428627981e-06, "loss": 0.3573, "reward": 2.051897406578064, "reward_std": 0.2726474329829216, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9737723618745804, "step": 2860 }, { "clip_ratio": 0.0, "completion_length": 268.89733505249023, "epoch": 0.8546038383989246, "grad_norm": 5.299470901489258, "kl": 0.26611328125, "learning_rate": 1.2575137243852965e-06, "loss": 0.2305, "reward": 2.0948661863803864, "reward_std": 0.25947003066539764, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.981026828289032, "step": 2861 }, { "clip_ratio": 0.0, "completion_length": 257.8482246398926, "epoch": 0.8549025464864461, "grad_norm": 0.4239216446876526, "kl": 0.281494140625, "learning_rate": 1.2524548168760043e-06, "loss": 0.1876, "reward": 2.063058167695999, "reward_std": 0.16122281551361084, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9827009290456772, "step": 2862 }, { "clip_ratio": 0.0, "completion_length": 300.4776916503906, "epoch": 0.8552012545739676, "grad_norm": 0.3872327506542206, "kl": 0.216064453125, "learning_rate": 1.2474054258385226e-06, "loss": 0.1643, "reward": 2.0418527722358704, "reward_std": 0.18536046892404556, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9815848618745804, "step": 2863 }, { "clip_ratio": 0.0, "completion_length": 278.98215103149414, "epoch": 0.8554999626614891, "grad_norm": 0.4331739842891693, "kl": 0.231689453125, "learning_rate": 1.2423655567660885e-06, "loss": 0.1728, "reward": 2.0412947237491608, "reward_std": 0.17390484921634197, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9877232611179352, "step": 2864 }, { "clip_ratio": 0.0, "completion_length": 273.1093940734863, "epoch": 0.8557986707490105, "grad_norm": 0.6597609519958496, "kl": 0.30712890625, "learning_rate": 1.2373352151415885e-06, "loss": 0.1347, "reward": 2.0708706080913544, "reward_std": 0.1665446273982525, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.98604916036129, "step": 2865 }, { "clip_ratio": 0.0, "completion_length": 289.4955520629883, "epoch": 0.856097378836532, "grad_norm": 0.3917545974254608, "kl": 0.263916015625, "learning_rate": 1.2323144064375435e-06, "loss": 0.2683, "reward": 2.007812589406967, "reward_std": 0.22198754735291004, "rewards/accuracy_reward": 0.06473214481957257, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9765625447034836, "step": 2866 }, { "clip_ratio": 0.0, "completion_length": 294.35938262939453, "epoch": 0.8563960869240534, "grad_norm": 0.8018951416015625, "kl": 0.1868896484375, "learning_rate": 1.2273031361160958e-06, "loss": 0.124, "reward": 2.0518975257873535, "reward_std": 0.16030766908079386, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9893973469734192, "step": 2867 }, { "clip_ratio": 0.0, "completion_length": 270.24778747558594, "epoch": 0.856694795011575, "grad_norm": 1.0187315940856934, "kl": 0.218505859375, "learning_rate": 1.2223014096290199e-06, "loss": 0.153, "reward": 2.13616082072258, "reward_std": 0.13087391667068005, "rewards/accuracy_reward": 0.16741072246804833, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.986607164144516, "step": 2868 }, { "clip_ratio": 0.0, "completion_length": 281.48439025878906, "epoch": 0.8569935030990964, "grad_norm": 0.7187067270278931, "kl": 0.31787109375, "learning_rate": 1.217309232417705e-06, "loss": 0.2672, "reward": 2.0541296005249023, "reward_std": 0.21761633828282356, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9804687947034836, "step": 2869 }, { "clip_ratio": 0.0, "completion_length": 278.95090103149414, "epoch": 0.8572922111866179, "grad_norm": 0.4146805703639984, "kl": 0.234619140625, "learning_rate": 1.212326609913147e-06, "loss": 0.0979, "reward": 1.993861734867096, "reward_std": 0.12628306169062853, "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9893973618745804, "step": 2870 }, { "clip_ratio": 0.0, "completion_length": 279.71875762939453, "epoch": 0.8575909192741393, "grad_norm": 1.5123836994171143, "kl": 0.429931640625, "learning_rate": 1.2073535475359533e-06, "loss": 0.152, "reward": 2.065290331840515, "reward_std": 0.17225131951272488, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9893973469734192, "step": 2871 }, { "clip_ratio": 0.0, "completion_length": 291.4085006713867, "epoch": 0.8578896273616609, "grad_norm": 0.2613489031791687, "kl": 0.189697265625, "learning_rate": 1.2023900506963293e-06, "loss": 0.1417, "reward": 2.0809152722358704, "reward_std": 0.18208124861121178, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9871652126312256, "step": 2872 }, { "clip_ratio": 0.0, "completion_length": 292.07814025878906, "epoch": 0.8581883354491823, "grad_norm": 0.937624990940094, "kl": 0.31787109375, "learning_rate": 1.1974361247940702e-06, "loss": 0.1887, "reward": 2.112165242433548, "reward_std": 0.24047375470399857, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9804687947034836, "step": 2873 }, { "clip_ratio": 0.0, "completion_length": 293.5446548461914, "epoch": 0.8584870435367038, "grad_norm": 0.3532610237598419, "kl": 0.234130859375, "learning_rate": 1.1924917752185628e-06, "loss": 0.2127, "reward": 2.1261161863803864, "reward_std": 0.18853824026882648, "rewards/accuracy_reward": 0.16741071757860482, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.981026828289032, "step": 2874 }, { "clip_ratio": 0.0, "completion_length": 277.6116180419922, "epoch": 0.8587857516242252, "grad_norm": 0.8280804753303528, "kl": 0.446044921875, "learning_rate": 1.1875570073487786e-06, "loss": 0.2629, "reward": 2.064174234867096, "reward_std": 0.28657127544283867, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.9726562798023224, "step": 2875 }, { "clip_ratio": 0.0, "completion_length": 291.0669746398926, "epoch": 0.8590844597117467, "grad_norm": 0.5083722472190857, "kl": 0.212890625, "learning_rate": 1.1826318265532543e-06, "loss": 0.1309, "reward": 2.010044753551483, "reward_std": 0.17392468452453613, "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9877232611179352, "step": 2876 }, { "clip_ratio": 0.0, "completion_length": 311.7723388671875, "epoch": 0.8593831677992682, "grad_norm": 0.6684111952781677, "kl": 0.414794921875, "learning_rate": 1.1777162381901108e-06, "loss": 0.2105, "reward": 2.1562500596046448, "reward_std": 0.21817069873213768, "rewards/accuracy_reward": 0.19642858393490314, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9843750298023224, "step": 2877 }, { "clip_ratio": 0.0, "completion_length": 293.50671005249023, "epoch": 0.8596818758867897, "grad_norm": 1.5088139772415161, "kl": 0.252685546875, "learning_rate": 1.1728102476070213e-06, "loss": 0.1852, "reward": 2.1222099661827087, "reward_std": 0.19923244882375002, "rewards/accuracy_reward": 0.16517857951112092, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9815848469734192, "step": 2878 }, { "clip_ratio": 0.0, "completion_length": 274.5870704650879, "epoch": 0.8599805839743111, "grad_norm": 0.387961208820343, "kl": 0.247802734375, "learning_rate": 1.1679138601412253e-06, "loss": 0.1953, "reward": 1.993861734867096, "reward_std": 0.1949845775961876, "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9827009290456772, "step": 2879 }, { "clip_ratio": 0.0, "completion_length": 267.9866180419922, "epoch": 0.8602792920618326, "grad_norm": 0.6844627261161804, "kl": 0.2587890625, "learning_rate": 1.1630270811195132e-06, "loss": 0.1261, "reward": 2.0703126192092896, "reward_std": 0.16870268993079662, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.987723246216774, "step": 2880 }, { "clip_ratio": 0.0, "completion_length": 266.53125762939453, "epoch": 0.860578000149354, "grad_norm": 1.6492376327514648, "kl": 0.39013671875, "learning_rate": 1.1581499158582187e-06, "loss": 0.2807, "reward": 2.093191981315613, "reward_std": 0.3191555105149746, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9793527275323868, "step": 2881 }, { "clip_ratio": 0.0, "completion_length": 287.10715103149414, "epoch": 0.8608767082368756, "grad_norm": 1.3661446571350098, "kl": 0.5634765625, "learning_rate": 1.1532823696632223e-06, "loss": 0.2316, "reward": 2.017299175262451, "reward_std": 0.21888544410467148, "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9815848767757416, "step": 2882 }, { "clip_ratio": 0.0, "completion_length": 302.5357208251953, "epoch": 0.861175416324397, "grad_norm": 1.1861999034881592, "kl": 0.6904296875, "learning_rate": 1.1484244478299366e-06, "loss": 0.1932, "reward": 2.0569196939468384, "reward_std": 0.2263924516737461, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9810268133878708, "step": 2883 }, { "clip_ratio": 0.0, "completion_length": 278.5825996398926, "epoch": 0.8614741244119185, "grad_norm": 0.8477755784988403, "kl": 0.54345703125, "learning_rate": 1.1435761556433035e-06, "loss": 0.2564, "reward": 2.046875089406967, "reward_std": 0.22645770758390427, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9776786118745804, "step": 2884 }, { "clip_ratio": 0.0, "completion_length": 288.9933166503906, "epoch": 0.8617728324994399, "grad_norm": 1.7084839344024658, "kl": 0.381591796875, "learning_rate": 1.1387374983777888e-06, "loss": 0.2718, "reward": 2.0150670409202576, "reward_std": 0.3034399040043354, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9748884290456772, "step": 2885 }, { "clip_ratio": 0.0, "completion_length": 275.4308166503906, "epoch": 0.8620715405869613, "grad_norm": 5.474015712738037, "kl": 0.306884765625, "learning_rate": 1.1339084812973823e-06, "loss": 0.1837, "reward": 2.044642925262451, "reward_std": 0.1831064447760582, "rewards/accuracy_reward": 0.08482143026776612, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9843750447034836, "step": 2886 }, { "clip_ratio": 0.0, "completion_length": 307.8973388671875, "epoch": 0.8623702486744829, "grad_norm": 1.5903263092041016, "kl": 0.444091796875, "learning_rate": 1.1290891096555746e-06, "loss": 0.2016, "reward": 2.070312649011612, "reward_std": 0.2599828317761421, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9765625596046448, "step": 2887 }, { "clip_ratio": 0.0, "completion_length": 267.8370666503906, "epoch": 0.8626689567620043, "grad_norm": 1.3900812864303589, "kl": 0.259033203125, "learning_rate": 1.124279388695373e-06, "loss": 0.2043, "reward": 2.0965402722358704, "reward_std": 0.1529454868286848, "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9871652126312256, "step": 2888 }, { "clip_ratio": 0.0, "completion_length": 285.1696586608887, "epoch": 0.8629676648495258, "grad_norm": 0.8725254535675049, "kl": 0.47412109375, "learning_rate": 1.119479323649284e-06, "loss": 0.2184, "reward": 2.0011161267757416, "reward_std": 0.22813672572374344, "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9765625447034836, "step": 2889 }, { "clip_ratio": 0.0, "completion_length": 312.79466247558594, "epoch": 0.8632663729370472, "grad_norm": 0.848982572555542, "kl": 0.38427734375, "learning_rate": 1.1146889197393052e-06, "loss": 0.1485, "reward": 2.0496652126312256, "reward_std": 0.19270794838666916, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9871652275323868, "step": 2890 }, { "clip_ratio": 0.0, "completion_length": 308.3236770629883, "epoch": 0.8635650810245687, "grad_norm": 134.6466522216797, "kl": 0.717041015625, "learning_rate": 1.1099081821769297e-06, "loss": 0.2392, "reward": 2.082031339406967, "reward_std": 0.25807612389326096, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9748884439468384, "step": 2891 }, { "clip_ratio": 0.0, "completion_length": 301.7924270629883, "epoch": 0.8638637891120902, "grad_norm": 2.023984909057617, "kl": 0.568603515625, "learning_rate": 1.1051371161631265e-06, "loss": 0.2877, "reward": 1.987165242433548, "reward_std": 0.26718076318502426, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.9620536267757416, "rewards/tag_count_reward": 0.976004496216774, "step": 2892 }, { "clip_ratio": 0.0, "completion_length": 277.41741943359375, "epoch": 0.8641624971996117, "grad_norm": 0.8301739692687988, "kl": 0.258056640625, "learning_rate": 1.100375726888352e-06, "loss": 0.1848, "reward": 2.0825893878936768, "reward_std": 0.2318627815693617, "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9866071790456772, "step": 2893 }, { "clip_ratio": 0.0, "completion_length": 317.8236770629883, "epoch": 0.8644612052871331, "grad_norm": 0.8817212581634521, "kl": 0.308349609375, "learning_rate": 1.0956240195325308e-06, "loss": 0.2371, "reward": 2.00334832072258, "reward_std": 0.2957608997821808, "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.97433041036129, "step": 2894 }, { "clip_ratio": 0.0, "completion_length": 276.10046005249023, "epoch": 0.8647599133746546, "grad_norm": 0.6056427955627441, "kl": 0.3291015625, "learning_rate": 1.090881999265051e-06, "loss": 0.2324, "reward": 2.0507813096046448, "reward_std": 0.23336314782500267, "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9748884439468384, "step": 2895 }, { "clip_ratio": 0.0, "completion_length": 307.6495666503906, "epoch": 0.865058621462176, "grad_norm": 0.5844752192497253, "kl": 0.3564453125, "learning_rate": 1.0861496712447694e-06, "loss": 0.1669, "reward": 2.009486675262451, "reward_std": 0.1425915639847517, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.987165242433548, "step": 2896 }, { "clip_ratio": 0.0, "completion_length": 290.41072845458984, "epoch": 0.8653573295496976, "grad_norm": 0.9098533391952515, "kl": 0.464111328125, "learning_rate": 1.0814270406199967e-06, "loss": 0.1483, "reward": 2.0809152722358704, "reward_std": 0.1722504161298275, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.9821429252624512, "rewards/tag_count_reward": 0.9871652275323868, "step": 2897 }, { "clip_ratio": 0.0, "completion_length": 279.57366943359375, "epoch": 0.865656037637219, "grad_norm": 1.4244881868362427, "kl": 0.4345703125, "learning_rate": 1.0767141125284875e-06, "loss": 0.2155, "reward": 2.0965402722358704, "reward_std": 0.2167389877140522, "rewards/accuracy_reward": 0.13616071874275804, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9849330633878708, "step": 2898 }, { "clip_ratio": 0.0, "completion_length": 287.32143783569336, "epoch": 0.8659547457247405, "grad_norm": 0.7486311793327332, "kl": 0.39111328125, "learning_rate": 1.072010892097447e-06, "loss": 0.2937, "reward": 2.0608259737491608, "reward_std": 0.24817104637622833, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9737723618745804, "step": 2899 }, { "clip_ratio": 0.0, "completion_length": 277.8147506713867, "epoch": 0.8662534538122619, "grad_norm": 4.722463130950928, "kl": 0.728515625, "learning_rate": 1.0673173844435214e-06, "loss": 0.2269, "reward": 2.0563617050647736, "reward_std": 0.2384861782193184, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9782366454601288, "step": 2900 }, { "clip_ratio": 0.0, "completion_length": 284.6584892272949, "epoch": 0.8665521618997835, "grad_norm": 0.8499763607978821, "kl": 0.4853515625, "learning_rate": 1.062633594672783e-06, "loss": 0.2425, "reward": 2.0625001192092896, "reward_std": 0.23821984976530075, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9776786118745804, "step": 2901 }, { "clip_ratio": 0.0, "completion_length": 281.8549270629883, "epoch": 0.8668508699873049, "grad_norm": 2.2240240573883057, "kl": 0.435546875, "learning_rate": 1.0579595278807376e-06, "loss": 0.2562, "reward": 2.0345983505249023, "reward_std": 0.26944008469581604, "rewards/accuracy_reward": 0.0915178656578064, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9810268431901932, "step": 2902 }, { "clip_ratio": 0.0, "completion_length": 262.75224685668945, "epoch": 0.8671495780748264, "grad_norm": 2.1954345703125, "kl": 0.64208984375, "learning_rate": 1.0532951891523124e-06, "loss": 0.2707, "reward": 2.0390625596046448, "reward_std": 0.2533872025087476, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.964285746216774, "rewards/tag_count_reward": 0.978794664144516, "step": 2903 }, { "clip_ratio": 0.0, "completion_length": 275.2656364440918, "epoch": 0.8674482861623478, "grad_norm": 1.1195521354675293, "kl": 0.54052734375, "learning_rate": 1.0486405835618496e-06, "loss": 0.2887, "reward": 2.1199777722358704, "reward_std": 0.2604141905903816, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.964285746216774, "rewards/tag_count_reward": 0.9793527126312256, "step": 2904 }, { "clip_ratio": 0.0, "completion_length": 287.5223388671875, "epoch": 0.8677469942498693, "grad_norm": 69.9618148803711, "kl": 0.9326171875, "learning_rate": 1.0439957161731062e-06, "loss": 0.2827, "reward": 1.9977679550647736, "reward_std": 0.23552870005369186, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.9575893133878708, "rewards/tag_count_reward": 0.973214328289032, "step": 2905 }, { "clip_ratio": 0.0, "completion_length": 283.5089416503906, "epoch": 0.8680457023373908, "grad_norm": 0.7988701462745667, "kl": 0.44580078125, "learning_rate": 1.039360592039238e-06, "loss": 0.1781, "reward": 2.161830425262451, "reward_std": 0.22853514179587364, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9877232760190964, "step": 2906 }, { "clip_ratio": 0.0, "completion_length": 239.98885345458984, "epoch": 0.8683444104249123, "grad_norm": 0.9618731141090393, "kl": 0.28271484375, "learning_rate": 1.034735216202809e-06, "loss": 0.147, "reward": 2.0926340222358704, "reward_std": 0.16204943135380745, "rewards/accuracy_reward": 0.11607143143191934, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553954601288, "step": 2907 }, { "clip_ratio": 0.0, "completion_length": 272.8415222167969, "epoch": 0.8686431185124337, "grad_norm": 1.7964959144592285, "kl": 0.402587890625, "learning_rate": 1.0301195936957765e-06, "loss": 0.2576, "reward": 2.041852742433548, "reward_std": 0.22933850809931755, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9815848618745804, "step": 2908 }, { "clip_ratio": 0.0, "completion_length": 298.1160888671875, "epoch": 0.8689418265999552, "grad_norm": 0.9303290843963623, "kl": 0.26513671875, "learning_rate": 1.0255137295394813e-06, "loss": 0.1442, "reward": 2.03459832072258, "reward_std": 0.1830823328346014, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9832589775323868, "step": 2909 }, { "clip_ratio": 0.0, "completion_length": 267.6227836608887, "epoch": 0.8692405346874766, "grad_norm": 37.08414077758789, "kl": 0.4990234375, "learning_rate": 1.0209176287446542e-06, "loss": 0.325, "reward": 2.0853795409202576, "reward_std": 0.2688302844762802, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.97823666036129, "step": 2910 }, { "clip_ratio": 0.0, "completion_length": 264.5111770629883, "epoch": 0.8695392427749982, "grad_norm": 0.6678333878517151, "kl": 0.328369140625, "learning_rate": 1.0163312963114035e-06, "loss": 0.1325, "reward": 2.014508992433548, "reward_std": 0.14023130014538765, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.987723246216774, "step": 2911 }, { "clip_ratio": 0.0, "completion_length": 286.2924270629883, "epoch": 0.8698379508625196, "grad_norm": 0.847891092300415, "kl": 0.657470703125, "learning_rate": 1.011754737229208e-06, "loss": 0.2566, "reward": 2.0518974661827087, "reward_std": 0.28821197524666786, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9760045111179352, "step": 2912 }, { "clip_ratio": 0.0, "completion_length": 256.6071586608887, "epoch": 0.8701366589500411, "grad_norm": 1.2320514917373657, "kl": 0.37353515625, "learning_rate": 1.0071879564769139e-06, "loss": 0.1659, "reward": 2.0669643580913544, "reward_std": 0.18326766602694988, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.9843750298023224, "step": 2913 }, { "clip_ratio": 0.0, "completion_length": 294.2634048461914, "epoch": 0.8704353670375625, "grad_norm": 0.8879086971282959, "kl": 0.476318359375, "learning_rate": 1.0026309590227358e-06, "loss": 0.206, "reward": 2.0262277722358704, "reward_std": 0.2050051810219884, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.9665178805589676, "rewards/tag_count_reward": 0.977120578289032, "step": 2914 }, { "clip_ratio": 0.0, "completion_length": 240.1741180419922, "epoch": 0.870734075125084, "grad_norm": 5.242064952850342, "kl": 0.41552734375, "learning_rate": 9.980837498242357e-07, "loss": 0.3025, "reward": 2.1138393878936768, "reward_std": 0.24395752884447575, "rewards/accuracy_reward": 0.1629464328289032, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9821428954601288, "step": 2915 }, { "clip_ratio": 0.0, "completion_length": 264.839298248291, "epoch": 0.8710327832126055, "grad_norm": 0.4799134433269501, "kl": 0.227783203125, "learning_rate": 9.935463338283325e-07, "loss": 0.2313, "reward": 2.061383992433548, "reward_std": 0.20760376751422882, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9832589626312256, "step": 2916 }, { "clip_ratio": 0.0, "completion_length": 251.08037185668945, "epoch": 0.871331491300127, "grad_norm": 0.9740557074546814, "kl": 0.266357421875, "learning_rate": 9.890187159712927e-07, "loss": 0.0845, "reward": 2.107701003551483, "reward_std": 0.11245571821928024, "rewards/accuracy_reward": 0.12053571920841932, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9960937798023224, "step": 2917 }, { "clip_ratio": 0.0, "completion_length": 267.2075958251953, "epoch": 0.8716301993876484, "grad_norm": 0.8688317537307739, "kl": 0.356201171875, "learning_rate": 9.845009011787166e-07, "loss": 0.1403, "reward": 1.997209906578064, "reward_std": 0.1410494912415743, "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.990513414144516, "step": 2918 }, { "clip_ratio": 0.0, "completion_length": 284.1272506713867, "epoch": 0.8719289074751699, "grad_norm": 0.6408266425132751, "kl": 0.3388671875, "learning_rate": 9.79992894365549e-07, "loss": 0.1839, "reward": 2.037388503551483, "reward_std": 0.19448136910796165, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9860491454601288, "step": 2919 }, { "clip_ratio": 0.0, "completion_length": 298.7477798461914, "epoch": 0.8722276155626913, "grad_norm": 1.1763991117477417, "kl": 0.286865234375, "learning_rate": 9.754947004360537e-07, "loss": 0.3775, "reward": 1.9843750894069672, "reward_std": 0.31009018421173096, "rewards/accuracy_reward": 0.058035715483129025, "rewards/format_reward": 0.9553571939468384, "rewards/tag_count_reward": 0.9709821790456772, "step": 2920 }, { "clip_ratio": 0.0, "completion_length": 272.9285774230957, "epoch": 0.8725263236502129, "grad_norm": 0.4064851999282837, "kl": 0.21044921875, "learning_rate": 9.710063242838286e-07, "loss": 0.093, "reward": 2.074776828289032, "reward_std": 0.140178507193923, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9921875298023224, "step": 2921 }, { "clip_ratio": 0.0, "completion_length": 248.2968864440918, "epoch": 0.8728250317377343, "grad_norm": 0.8086876273155212, "kl": 0.315185546875, "learning_rate": 9.665277707917875e-07, "loss": 0.1874, "reward": 2.0792412161827087, "reward_std": 0.1547437570989132, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9854911118745804, "step": 2922 }, { "clip_ratio": 0.0, "completion_length": 246.25223541259766, "epoch": 0.8731237398252558, "grad_norm": 0.7992217540740967, "kl": 0.26416015625, "learning_rate": 9.620590448321554e-07, "loss": 0.1955, "reward": 2.1612723767757416, "reward_std": 0.2295474112033844, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973618745804, "step": 2923 }, { "clip_ratio": 0.0, "completion_length": 288.0401840209961, "epoch": 0.8734224479127772, "grad_norm": 1.2198052406311035, "kl": 0.289306640625, "learning_rate": 9.576001512664678e-07, "loss": 0.2398, "reward": 2.039062589406967, "reward_std": 0.24669124558568, "rewards/accuracy_reward": 0.08258929220028222, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9810268133878708, "step": 2924 }, { "clip_ratio": 0.0, "completion_length": 276.0558204650879, "epoch": 0.8737211560002988, "grad_norm": 1.6017662286758423, "kl": 0.311279296875, "learning_rate": 9.531510949455681e-07, "loss": 0.1945, "reward": 2.076451003551483, "reward_std": 0.19323166646063328, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.987165242433548, "step": 2925 }, { "clip_ratio": 0.0, "completion_length": 295.37947845458984, "epoch": 0.8740198640878202, "grad_norm": 0.5732828378677368, "kl": 0.39208984375, "learning_rate": 9.487118807095885e-07, "loss": 0.3105, "reward": 2.080357253551483, "reward_std": 0.3259578086435795, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.973214328289032, "step": 2926 }, { "clip_ratio": 0.0, "completion_length": 288.38170623779297, "epoch": 0.8743185721753417, "grad_norm": 0.5919833183288574, "kl": 0.3125, "learning_rate": 9.442825133879608e-07, "loss": 0.1569, "reward": 2.0619421005249023, "reward_std": 0.15864209085702896, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9860491454601288, "step": 2927 }, { "clip_ratio": 0.0, "completion_length": 273.9709892272949, "epoch": 0.8746172802628631, "grad_norm": 2.787226676940918, "kl": 0.41455078125, "learning_rate": 9.398629977994056e-07, "loss": 0.1437, "reward": 2.0357143878936768, "reward_std": 0.19948404282331467, "rewards/accuracy_reward": 0.06696429010480642, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.986607164144516, "step": 2928 }, { "clip_ratio": 0.0, "completion_length": 247.23215866088867, "epoch": 0.8749159883503845, "grad_norm": 2.62986421585083, "kl": 0.499267578125, "learning_rate": 9.354533387519171e-07, "loss": 0.3565, "reward": 2.0775671005249023, "reward_std": 0.2404867671430111, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9815848469734192, "step": 2929 }, { "clip_ratio": 0.0, "completion_length": 241.5223274230957, "epoch": 0.8752146964379061, "grad_norm": 0.48153406381607056, "kl": 0.296630859375, "learning_rate": 9.310535410427767e-07, "loss": 0.1481, "reward": 2.1372768878936768, "reward_std": 0.1736856084316969, "rewards/accuracy_reward": 0.15401786798611283, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9921875298023224, "step": 2930 }, { "clip_ratio": 0.0, "completion_length": 250.15849685668945, "epoch": 0.8755134045254275, "grad_norm": 0.9206944108009338, "kl": 0.30859375, "learning_rate": 9.266636094585301e-07, "loss": 0.1428, "reward": 2.052455425262451, "reward_std": 0.20783833414316177, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.987723246216774, "step": 2931 }, { "clip_ratio": 0.0, "completion_length": 293.78126525878906, "epoch": 0.875812112612949, "grad_norm": 6.21597146987915, "kl": 0.43798828125, "learning_rate": 9.222835487749937e-07, "loss": 0.2726, "reward": 2.161830484867096, "reward_std": 0.27576233446598053, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9832589775323868, "step": 2932 }, { "clip_ratio": 0.0, "completion_length": 272.6116180419922, "epoch": 0.8761108207004704, "grad_norm": 0.8885441422462463, "kl": 0.28466796875, "learning_rate": 9.179133637572457e-07, "loss": 0.1302, "reward": 1.9955357611179352, "reward_std": 0.16471827775239944, "rewards/accuracy_reward": 0.029017859138548374, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.986607164144516, "step": 2933 }, { "clip_ratio": 0.0, "completion_length": 304.03126525878906, "epoch": 0.8764095287879919, "grad_norm": 0.7097525000572205, "kl": 0.41650390625, "learning_rate": 9.135530591596165e-07, "loss": 0.2111, "reward": 2.0334822237491608, "reward_std": 0.19950871169567108, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9866071939468384, "step": 2934 }, { "clip_ratio": 0.0, "completion_length": 275.97322845458984, "epoch": 0.8767082368755134, "grad_norm": 2.33776593208313, "kl": 0.505859375, "learning_rate": 9.092026397256914e-07, "loss": 0.2598, "reward": 2.0145090222358704, "reward_std": 0.22526071965694427, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9787946790456772, "step": 2935 }, { "clip_ratio": 0.0, "completion_length": 258.74554443359375, "epoch": 0.8770069449630349, "grad_norm": 1.268784999847412, "kl": 0.3896484375, "learning_rate": 9.048621101883026e-07, "loss": 0.2163, "reward": 2.142299175262451, "reward_std": 0.1803802326321602, "rewards/accuracy_reward": 0.17633928847499192, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9860491454601288, "step": 2936 }, { "clip_ratio": 0.0, "completion_length": 294.21876525878906, "epoch": 0.8773056530505563, "grad_norm": 0.947383463382721, "kl": 0.642822265625, "learning_rate": 9.00531475269516e-07, "loss": 0.2742, "reward": 2.059151828289032, "reward_std": 0.20427340641617775, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.981026828289032, "step": 2937 }, { "clip_ratio": 0.0, "completion_length": 250.42189025878906, "epoch": 0.8776043611380778, "grad_norm": 0.49801933765411377, "kl": 0.28564453125, "learning_rate": 8.962107396806407e-07, "loss": 0.1815, "reward": 2.036830425262451, "reward_std": 0.15715552307665348, "rewards/accuracy_reward": 0.06696428707800806, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9877232611179352, "step": 2938 }, { "clip_ratio": 0.0, "completion_length": 279.6718864440918, "epoch": 0.8779030692255992, "grad_norm": 0.46802201867103577, "kl": 0.23681640625, "learning_rate": 8.918999081222157e-07, "loss": 0.2385, "reward": 2.0239956378936768, "reward_std": 0.19946424663066864, "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9815848767757416, "step": 2939 }, { "clip_ratio": 0.0, "completion_length": 262.5982208251953, "epoch": 0.8782017773131208, "grad_norm": 0.4845745861530304, "kl": 0.2421875, "learning_rate": 8.875989852839984e-07, "loss": 0.1654, "reward": 2.088169753551483, "reward_std": 0.1922741737216711, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9854911118745804, "step": 2940 }, { "clip_ratio": 0.0, "completion_length": 277.1093864440918, "epoch": 0.8785004854006422, "grad_norm": 0.9747012853622437, "kl": 0.43359375, "learning_rate": 8.833079758449748e-07, "loss": 0.1531, "reward": 2.1992188692092896, "reward_std": 0.16692808642983437, "rewards/accuracy_reward": 0.2343750186264515, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.984933078289032, "step": 2941 }, { "clip_ratio": 0.0, "completion_length": 290.15626525878906, "epoch": 0.8787991934881637, "grad_norm": 1.381899118423462, "kl": 0.374755859375, "learning_rate": 8.79026884473343e-07, "loss": 0.2378, "reward": 2.0625000596046448, "reward_std": 0.2581752873957157, "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9843750447034836, "step": 2942 }, { "clip_ratio": 0.0, "completion_length": 285.51564025878906, "epoch": 0.8790979015756851, "grad_norm": 0.3614780306816101, "kl": 0.37158203125, "learning_rate": 8.747557158265074e-07, "loss": 0.2015, "reward": 2.029017895460129, "reward_std": 0.21965042501688004, "rewards/accuracy_reward": 0.06919643399305642, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9821428805589676, "step": 2943 }, { "clip_ratio": 0.0, "completion_length": 300.65626525878906, "epoch": 0.8793966096632067, "grad_norm": 0.7939281463623047, "kl": 0.4658203125, "learning_rate": 8.704944745510846e-07, "loss": 0.1968, "reward": 2.0619420409202576, "reward_std": 0.19508006237447262, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9815848767757416, "step": 2944 }, { "clip_ratio": 0.0, "completion_length": 289.3169746398926, "epoch": 0.8796953177507281, "grad_norm": 0.4234409034252167, "kl": 0.1912841796875, "learning_rate": 8.66243165282884e-07, "loss": 0.1695, "reward": 2.053013503551483, "reward_std": 0.1776741947978735, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9838170111179352, "step": 2945 }, { "clip_ratio": 0.0, "completion_length": 277.7589340209961, "epoch": 0.8799940258382496, "grad_norm": 0.7829691767692566, "kl": 0.3876953125, "learning_rate": 8.620017926469149e-07, "loss": 0.2547, "reward": 2.075334906578064, "reward_std": 0.220154894515872, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9793527275323868, "step": 2946 }, { "clip_ratio": 0.0, "completion_length": 275.2477836608887, "epoch": 0.880292733925771, "grad_norm": 0.6048048734664917, "kl": 0.365234375, "learning_rate": 8.577703612573784e-07, "loss": 0.2017, "reward": 1.9771206378936768, "reward_std": 0.1863947007805109, "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.9793527126312256, "step": 2947 }, { "clip_ratio": 0.0, "completion_length": 265.8102722167969, "epoch": 0.8805914420132925, "grad_norm": 0.4205069839954376, "kl": 0.3037109375, "learning_rate": 8.535488757176513e-07, "loss": 0.1085, "reward": 2.142299234867096, "reward_std": 0.17907260172069073, "rewards/accuracy_reward": 0.16517857648432255, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.990513414144516, "step": 2948 }, { "clip_ratio": 0.0, "completion_length": 242.01787185668945, "epoch": 0.880890150100814, "grad_norm": 0.3546193838119507, "kl": 0.2431640625, "learning_rate": 8.493373406202987e-07, "loss": 0.143, "reward": 2.0948662161827087, "reward_std": 0.2075724545866251, "rewards/accuracy_reward": 0.12053572246804833, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553954601288, "step": 2949 }, { "clip_ratio": 0.0, "completion_length": 250.4352798461914, "epoch": 0.8811888581883355, "grad_norm": 0.4033135175704956, "kl": 0.21484375, "learning_rate": 8.4513576054706e-07, "loss": 0.1064, "reward": 2.149553656578064, "reward_std": 0.1464936099946499, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.993303582072258, "step": 2950 }, { "clip_ratio": 0.0, "completion_length": 254.34822463989258, "epoch": 0.8814875662758569, "grad_norm": 0.644675612449646, "kl": 0.53369140625, "learning_rate": 8.409441400688401e-07, "loss": 0.1281, "reward": 2.046875089406967, "reward_std": 0.2049717679619789, "rewards/accuracy_reward": 0.08928572223521769, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9821428954601288, "step": 2951 }, { "clip_ratio": 0.0, "completion_length": 255.71206665039062, "epoch": 0.8817862743633784, "grad_norm": 1.3424794673919678, "kl": 0.415771484375, "learning_rate": 8.3676248374571e-07, "loss": 0.2451, "reward": 2.017857253551483, "reward_std": 0.20761023834347725, "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9799107760190964, "step": 2952 }, { "clip_ratio": 0.0, "completion_length": 287.40402603149414, "epoch": 0.8820849824508998, "grad_norm": 0.4644409120082855, "kl": 0.281494140625, "learning_rate": 8.325907961269064e-07, "loss": 0.1316, "reward": 2.0636161863803864, "reward_std": 0.153646532446146, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9854910969734192, "step": 2953 }, { "clip_ratio": 0.0, "completion_length": 285.5245590209961, "epoch": 0.8823836905384214, "grad_norm": 0.8675491809844971, "kl": 0.239501953125, "learning_rate": 8.284290817508122e-07, "loss": 0.1637, "reward": 2.0066964626312256, "reward_std": 0.1776847243309021, "rewards/accuracy_reward": 0.040178571827709675, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9866071790456772, "step": 2954 }, { "clip_ratio": 0.0, "completion_length": 241.42858505249023, "epoch": 0.8826823986259428, "grad_norm": 0.4809974133968353, "kl": 0.28076171875, "learning_rate": 8.24277345144967e-07, "loss": 0.1723, "reward": 2.1004465222358704, "reward_std": 0.19723611045628786, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9888393133878708, "step": 2955 }, { "clip_ratio": 0.0, "completion_length": 260.62947845458984, "epoch": 0.8829811067134643, "grad_norm": 1.3645497560501099, "kl": 0.281005859375, "learning_rate": 8.201355908260544e-07, "loss": 0.1942, "reward": 2.1160715222358704, "reward_std": 0.17835709266364574, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9910714626312256, "step": 2956 }, { "clip_ratio": 0.0, "completion_length": 284.8727798461914, "epoch": 0.8832798148009857, "grad_norm": 16.899658203125, "kl": 1.330322265625, "learning_rate": 8.160038232998935e-07, "loss": 0.2974, "reward": 2.0139509439468384, "reward_std": 0.16320021077990532, "rewards/accuracy_reward": 0.04910714388824999, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.984933078289032, "step": 2957 }, { "clip_ratio": 0.0, "completion_length": 271.1495666503906, "epoch": 0.8835785228885072, "grad_norm": 0.4080846309661865, "kl": 0.3642578125, "learning_rate": 8.118820470614463e-07, "loss": 0.2556, "reward": 2.0011161863803864, "reward_std": 0.24246570095419884, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9765625447034836, "step": 2958 }, { "clip_ratio": 0.0, "completion_length": 292.0692138671875, "epoch": 0.8838772309760287, "grad_norm": 0.48714226484298706, "kl": 0.295166015625, "learning_rate": 8.077702665947973e-07, "loss": 0.1858, "reward": 2.0820313692092896, "reward_std": 0.16944725904613733, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9860491454601288, "step": 2959 }, { "clip_ratio": 0.0, "completion_length": 277.7924270629883, "epoch": 0.8841759390635502, "grad_norm": 0.4379011392593384, "kl": 0.26220703125, "learning_rate": 8.036684863731636e-07, "loss": 0.148, "reward": 2.0530135333538055, "reward_std": 0.13795342948287725, "rewards/accuracy_reward": 0.0803571492433548, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9882812649011612, "step": 2960 }, { "clip_ratio": 0.0, "completion_length": 292.1540412902832, "epoch": 0.8844746471510716, "grad_norm": 0.3562096953392029, "kl": 0.36669921875, "learning_rate": 7.995767108588814e-07, "loss": 0.1767, "reward": 2.0881697237491608, "reward_std": 0.25588780269026756, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9832589775323868, "step": 2961 }, { "clip_ratio": 0.0, "completion_length": 281.6584930419922, "epoch": 0.8847733552385931, "grad_norm": 0.7858858704566956, "kl": 0.250732421875, "learning_rate": 7.954949445033966e-07, "loss": 0.1424, "reward": 2.168526828289032, "reward_std": 0.1582580730319023, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9921875298023224, "step": 2962 }, { "clip_ratio": 0.0, "completion_length": 281.25001525878906, "epoch": 0.8850720633261145, "grad_norm": 6.359358787536621, "kl": 0.72216796875, "learning_rate": 7.914231917472748e-07, "loss": 0.368, "reward": 1.988839328289032, "reward_std": 0.22062667831778526, "rewards/accuracy_reward": 0.037946428870782256, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9799107611179352, "step": 2963 }, { "clip_ratio": 0.0, "completion_length": 258.4531364440918, "epoch": 0.8853707714136361, "grad_norm": 0.6036217212677002, "kl": 0.33251953125, "learning_rate": 7.873614570201838e-07, "loss": 0.211, "reward": 2.1908482909202576, "reward_std": 0.24692068621516228, "rewards/accuracy_reward": 0.22767858393490314, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9854910969734192, "step": 2964 }, { "clip_ratio": 0.0, "completion_length": 292.80358123779297, "epoch": 0.8856694795011575, "grad_norm": 6.310649871826172, "kl": 0.286376953125, "learning_rate": 7.833097447408911e-07, "loss": 0.192, "reward": 1.96428582072258, "reward_std": 0.17337798047810793, "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9821428954601288, "step": 2965 }, { "clip_ratio": 0.0, "completion_length": 277.7009048461914, "epoch": 0.885968187588679, "grad_norm": 0.8968981504440308, "kl": 0.67041015625, "learning_rate": 7.792680593172619e-07, "loss": 0.3736, "reward": 2.0145090222358704, "reward_std": 0.22247854992747307, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9765625447034836, "step": 2966 }, { "clip_ratio": 0.0, "completion_length": 273.65402603149414, "epoch": 0.8862668956762004, "grad_norm": 1.0460559129714966, "kl": 0.18310546875, "learning_rate": 7.75236405146258e-07, "loss": 0.1891, "reward": 2.133928656578064, "reward_std": 0.1899685300886631, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9888393133878708, "step": 2967 }, { "clip_ratio": 0.0, "completion_length": 273.7991180419922, "epoch": 0.886565603763722, "grad_norm": 0.7879588007926941, "kl": 0.35888671875, "learning_rate": 7.712147866139197e-07, "loss": 0.1677, "reward": 2.0373884737491608, "reward_std": 0.1588907241821289, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9905134290456772, "step": 2968 }, { "clip_ratio": 0.0, "completion_length": 258.5669708251953, "epoch": 0.8868643118512434, "grad_norm": 1.9509867429733276, "kl": 0.39453125, "learning_rate": 7.672032080953751e-07, "loss": 0.1497, "reward": 2.09709832072258, "reward_std": 0.11223187018185854, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9966518133878708, "step": 2969 }, { "clip_ratio": 0.0, "completion_length": 289.3928680419922, "epoch": 0.8871630199387649, "grad_norm": 1.0006736516952515, "kl": 0.34912109375, "learning_rate": 7.632016739548309e-07, "loss": 0.227, "reward": 2.070870667695999, "reward_std": 0.2859703302383423, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9793527275323868, "step": 2970 }, { "clip_ratio": 0.0, "completion_length": 294.48885345458984, "epoch": 0.8874617280262863, "grad_norm": 1.4228570461273193, "kl": 0.353271484375, "learning_rate": 7.592101885455594e-07, "loss": 0.2403, "reward": 2.0373884439468384, "reward_std": 0.2528439089655876, "rewards/accuracy_reward": 0.09375000861473382, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9771205931901932, "step": 2971 }, { "clip_ratio": 0.0, "completion_length": 276.5334892272949, "epoch": 0.8877604361138077, "grad_norm": 1.1436222791671753, "kl": 0.359130859375, "learning_rate": 7.552287562099103e-07, "loss": 0.0241, "reward": 2.0574777722358704, "reward_std": 0.06800232641398907, "rewards/accuracy_reward": 0.06473214412108064, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9972098469734192, "step": 2972 }, { "clip_ratio": 0.0, "completion_length": 277.7500114440918, "epoch": 0.8880591442013293, "grad_norm": 1.150019645690918, "kl": 0.364013671875, "learning_rate": 7.512573812792878e-07, "loss": 0.2531, "reward": 2.1255581378936768, "reward_std": 0.2416427619755268, "rewards/accuracy_reward": 0.17857143771834671, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9782366454601288, "step": 2973 }, { "clip_ratio": 0.0, "completion_length": 273.17189025878906, "epoch": 0.8883578522888507, "grad_norm": 0.3847915232181549, "kl": 0.24755859375, "learning_rate": 7.472960680741603e-07, "loss": 0.1642, "reward": 2.1668527722358704, "reward_std": 0.20773157477378845, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9927455484867096, "step": 2974 }, { "clip_ratio": 0.0, "completion_length": 279.6004638671875, "epoch": 0.8886565603763722, "grad_norm": 0.3372030556201935, "kl": 0.2548828125, "learning_rate": 7.433448209040495e-07, "loss": 0.1251, "reward": 2.0775670409202576, "reward_std": 0.1434371629729867, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.990513414144516, "step": 2975 }, { "clip_ratio": 0.0, "completion_length": 289.00893783569336, "epoch": 0.8889552684638936, "grad_norm": 1.8527469635009766, "kl": 0.5625, "learning_rate": 7.394036440675223e-07, "loss": 0.3007, "reward": 2.1367188692092896, "reward_std": 0.26825398579239845, "rewards/accuracy_reward": 0.1808035841677338, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9827009439468384, "step": 2976 }, { "clip_ratio": 0.0, "completion_length": 274.84152603149414, "epoch": 0.8892539765514151, "grad_norm": 0.5997388362884521, "kl": 0.6435546875, "learning_rate": 7.354725418521947e-07, "loss": 0.1364, "reward": 2.0792411267757416, "reward_std": 0.16818481963127851, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9810268133878708, "step": 2977 }, { "clip_ratio": 0.0, "completion_length": 300.00671005249023, "epoch": 0.8895526846389366, "grad_norm": 0.9852425456047058, "kl": 0.55908203125, "learning_rate": 7.315515185347222e-07, "loss": 0.2196, "reward": 2.165178656578064, "reward_std": 0.20135420188307762, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9866071790456772, "step": 2978 }, { "clip_ratio": 0.0, "completion_length": 283.51340103149414, "epoch": 0.8898513927264581, "grad_norm": 1.0046671628952026, "kl": 0.401123046875, "learning_rate": 7.276405783807894e-07, "loss": 0.2159, "reward": 2.1244420409202576, "reward_std": 0.23473627865314484, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9905134290456772, "step": 2979 }, { "clip_ratio": 0.0, "completion_length": 288.20983123779297, "epoch": 0.8901501008139795, "grad_norm": 1.148683786392212, "kl": 0.267578125, "learning_rate": 7.237397256451195e-07, "loss": 0.2303, "reward": 2.0195313692092896, "reward_std": 0.21599629893898964, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9838170111179352, "step": 2980 }, { "clip_ratio": 0.0, "completion_length": 305.31920623779297, "epoch": 0.890448808901501, "grad_norm": 0.8128341436386108, "kl": 0.349609375, "learning_rate": 7.198489645714579e-07, "loss": 0.1696, "reward": 2.0608260333538055, "reward_std": 0.18707757629454136, "rewards/accuracy_reward": 0.10044643096625805, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.984933078289032, "step": 2981 }, { "clip_ratio": 0.0, "completion_length": 328.30582427978516, "epoch": 0.8907475169890224, "grad_norm": 1.5809729099273682, "kl": 0.473876953125, "learning_rate": 7.159682993925687e-07, "loss": 0.2424, "reward": 2.055245667695999, "reward_std": 0.2494329996407032, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9815848618745804, "step": 2982 }, { "clip_ratio": 0.0, "completion_length": 303.2366256713867, "epoch": 0.891046225076544, "grad_norm": 0.9765303730964661, "kl": 0.5576171875, "learning_rate": 7.12097734330236e-07, "loss": 0.2363, "reward": 2.031250089406967, "reward_std": 0.2231335062533617, "rewards/accuracy_reward": 0.08035714854486287, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9799107611179352, "step": 2983 }, { "clip_ratio": 0.0, "completion_length": 310.5558204650879, "epoch": 0.8913449331640654, "grad_norm": 0.8965809941291809, "kl": 0.360107421875, "learning_rate": 7.082372735952591e-07, "loss": 0.1833, "reward": 2.0474331080913544, "reward_std": 0.27572811022400856, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.984933078289032, "step": 2984 }, { "clip_ratio": 0.0, "completion_length": 302.4799270629883, "epoch": 0.8916436412515869, "grad_norm": 1.0725507736206055, "kl": 0.4560546875, "learning_rate": 7.043869213874355e-07, "loss": 0.1988, "reward": 1.9598215520381927, "reward_std": 0.2062030928209424, "rewards/accuracy_reward": 0.015625000465661287, "rewards/format_reward": 0.964285746216774, "rewards/tag_count_reward": 0.979910746216774, "step": 2985 }, { "clip_ratio": 0.0, "completion_length": 273.792423248291, "epoch": 0.8919423493391083, "grad_norm": 1.038622498512268, "kl": 0.28955078125, "learning_rate": 7.005466818955753e-07, "loss": 0.2651, "reward": 2.0407367050647736, "reward_std": 0.26034674793481827, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9827009290456772, "step": 2986 }, { "clip_ratio": 0.0, "completion_length": 279.29241943359375, "epoch": 0.8922410574266298, "grad_norm": 2.253253936767578, "kl": 0.448486328125, "learning_rate": 6.96716559297479e-07, "loss": 0.1719, "reward": 2.0418527722358704, "reward_std": 0.19827308133244514, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9882812947034836, "step": 2987 }, { "clip_ratio": 0.0, "completion_length": 299.24108123779297, "epoch": 0.8925397655141513, "grad_norm": 0.7612330317497253, "kl": 0.484375, "learning_rate": 6.928965577599467e-07, "loss": 0.2238, "reward": 2.1908483505249023, "reward_std": 0.24986915662884712, "rewards/accuracy_reward": 0.2366071492433548, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9832589775323868, "step": 2988 }, { "clip_ratio": 0.0, "completion_length": 299.3616256713867, "epoch": 0.8928384736016728, "grad_norm": 0.8153628706932068, "kl": 0.44970703125, "learning_rate": 6.890866814387676e-07, "loss": 0.2426, "reward": 2.098772406578064, "reward_std": 0.25279444828629494, "rewards/accuracy_reward": 0.15401786798611283, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.97823666036129, "step": 2989 }, { "clip_ratio": 0.0, "completion_length": 283.7633972167969, "epoch": 0.8931371816891942, "grad_norm": 0.4217347204685211, "kl": 0.31005859375, "learning_rate": 6.852869344787084e-07, "loss": 0.1523, "reward": 2.0731027722358704, "reward_std": 0.16408739425241947, "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9882813096046448, "step": 2990 }, { "clip_ratio": 0.0, "completion_length": 275.7209930419922, "epoch": 0.8934358897767157, "grad_norm": 3.4491593837738037, "kl": 0.732421875, "learning_rate": 6.814973210135256e-07, "loss": 0.2105, "reward": 2.079799175262451, "reward_std": 0.21632660552859306, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9793527126312256, "step": 2991 }, { "clip_ratio": 0.0, "completion_length": 290.5759048461914, "epoch": 0.8937345978642371, "grad_norm": 0.8990910053253174, "kl": 0.25830078125, "learning_rate": 6.777178451659472e-07, "loss": 0.1583, "reward": 2.0898438692092896, "reward_std": 0.23846864327788353, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9804687798023224, "step": 2992 }, { "clip_ratio": 0.0, "completion_length": 296.7633972167969, "epoch": 0.8940333059517587, "grad_norm": 1.043928623199463, "kl": 0.316162109375, "learning_rate": 6.739485110476707e-07, "loss": 0.1348, "reward": 2.04631707072258, "reward_std": 0.1839427947998047, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9882812798023224, "step": 2993 }, { "clip_ratio": 0.0, "completion_length": 272.3951110839844, "epoch": 0.8943320140392801, "grad_norm": 3.589860439300537, "kl": 0.6826171875, "learning_rate": 6.701893227593614e-07, "loss": 0.28, "reward": 2.0178572833538055, "reward_std": 0.23095464147627354, "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9821428954601288, "step": 2994 }, { "clip_ratio": 0.0, "completion_length": 275.1116180419922, "epoch": 0.8946307221268016, "grad_norm": 0.5403110980987549, "kl": 0.313720703125, "learning_rate": 6.664402843906515e-07, "loss": 0.1196, "reward": 2.0128349363803864, "reward_std": 0.14110480342060328, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9905134290456772, "step": 2995 }, { "clip_ratio": 0.0, "completion_length": 258.073673248291, "epoch": 0.894929430214323, "grad_norm": 1.2157559394836426, "kl": 0.44873046875, "learning_rate": 6.627014000201237e-07, "loss": 0.3531, "reward": 2.1216518878936768, "reward_std": 0.2695283442735672, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9832589626312256, "step": 2996 }, { "clip_ratio": 0.0, "completion_length": 281.45760345458984, "epoch": 0.8952281383018446, "grad_norm": 0.9054875373840332, "kl": 0.454345703125, "learning_rate": 6.58972673715319e-07, "loss": 0.2449, "reward": 2.026227831840515, "reward_std": 0.22091418504714966, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9882813096046448, "step": 2997 }, { "clip_ratio": 0.0, "completion_length": 285.4643020629883, "epoch": 0.895526846389366, "grad_norm": 1.3215065002441406, "kl": 0.423828125, "learning_rate": 6.552541095327281e-07, "loss": 0.263, "reward": 2.0206474363803864, "reward_std": 0.2543798200786114, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 2998 }, { "clip_ratio": 0.0, "completion_length": 272.4196548461914, "epoch": 0.8958255544768875, "grad_norm": 1.4078807830810547, "kl": 0.683837890625, "learning_rate": 6.515457115177804e-07, "loss": 0.2334, "reward": 2.021205425262451, "reward_std": 0.28620829805731773, "rewards/accuracy_reward": 0.0803571492433548, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9787946790456772, "step": 2999 }, { "clip_ratio": 0.0, "completion_length": 251.08259963989258, "epoch": 0.8961242625644089, "grad_norm": 1.4404140710830688, "kl": 0.48828125, "learning_rate": 6.478474837048532e-07, "loss": 0.379, "reward": 2.071986734867096, "reward_std": 0.2420032974332571, "rewards/accuracy_reward": 0.12723214901052415, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9827009290456772, "step": 3000 }, { "clip_ratio": 0.0, "completion_length": 269.8705520629883, "epoch": 0.8964229706519304, "grad_norm": 0.8319172263145447, "kl": 0.399658203125, "learning_rate": 6.441594301172527e-07, "loss": 0.227, "reward": 2.056361645460129, "reward_std": 0.17855114303529263, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.984933078289032, "step": 3001 }, { "clip_ratio": 0.0, "completion_length": 278.1651916503906, "epoch": 0.8967216787394519, "grad_norm": 1.022654414176941, "kl": 0.384765625, "learning_rate": 6.404815547672216e-07, "loss": 0.261, "reward": 2.1216519474983215, "reward_std": 0.25308534130454063, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9854911267757416, "step": 3002 }, { "clip_ratio": 0.0, "completion_length": 276.308048248291, "epoch": 0.8970203868269734, "grad_norm": 2.029531955718994, "kl": 0.668212890625, "learning_rate": 6.368138616559283e-07, "loss": 0.2731, "reward": 2.012834906578064, "reward_std": 0.20315281674265862, "rewards/accuracy_reward": 0.051339286379516125, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9860491454601288, "step": 3003 }, { "clip_ratio": 0.0, "completion_length": 259.7790298461914, "epoch": 0.8973190949144948, "grad_norm": 1.5087624788284302, "kl": 0.2958984375, "learning_rate": 6.331563547734621e-07, "loss": 0.118, "reward": 2.111607253551483, "reward_std": 0.14169546961784363, "rewards/accuracy_reward": 0.12723214831203222, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9955357611179352, "step": 3004 }, { "clip_ratio": 0.0, "completion_length": 264.9709930419922, "epoch": 0.8976178030020163, "grad_norm": 1.3826167583465576, "kl": 0.448974609375, "learning_rate": 6.295090380988323e-07, "loss": 0.1249, "reward": 2.087053656578064, "reward_std": 0.20609963685274124, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9933036118745804, "step": 3005 }, { "clip_ratio": 0.0, "completion_length": 279.5424270629883, "epoch": 0.8979165110895377, "grad_norm": 1.2381504774093628, "kl": 0.42822265625, "learning_rate": 6.258719155999637e-07, "loss": 0.1667, "reward": 2.107142925262451, "reward_std": 0.2676101140677929, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9866071939468384, "step": 3006 }, { "clip_ratio": 0.0, "completion_length": 258.75671005249023, "epoch": 0.8982152191770593, "grad_norm": 0.9324179887771606, "kl": 0.4501953125, "learning_rate": 6.222449912336859e-07, "loss": 0.2425, "reward": 2.1049108505249023, "reward_std": 0.23140954971313477, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.975446492433548, "rewards/tag_count_reward": 0.988839328289032, "step": 3007 }, { "clip_ratio": 0.0, "completion_length": 303.5000228881836, "epoch": 0.8985139272645807, "grad_norm": 0.98171466588974, "kl": 0.4345703125, "learning_rate": 6.18628268945739e-07, "loss": 0.3214, "reward": 2.121093839406967, "reward_std": 0.3078821934759617, "rewards/accuracy_reward": 0.18750000861473382, "rewards/format_reward": 0.9531250298023224, "rewards/tag_count_reward": 0.9804687947034836, "step": 3008 }, { "clip_ratio": 0.0, "completion_length": 236.26563262939453, "epoch": 0.8988126353521022, "grad_norm": 9.446365356445312, "kl": 1.019775390625, "learning_rate": 6.150217526707636e-07, "loss": 0.2105, "reward": 2.1043527722358704, "reward_std": 0.20520227029919624, "rewards/accuracy_reward": 0.12276786472648382, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9927455633878708, "step": 3009 }, { "clip_ratio": 0.0, "completion_length": 276.4977836608887, "epoch": 0.8991113434396236, "grad_norm": 1.159996509552002, "kl": 0.439453125, "learning_rate": 6.114254463322933e-07, "loss": 0.294, "reward": 2.04631707072258, "reward_std": 0.35847119241952896, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.948660746216774, "rewards/tag_count_reward": 0.9726562798023224, "step": 3010 }, { "clip_ratio": 0.0, "completion_length": 250.41741943359375, "epoch": 0.8994100515271451, "grad_norm": 1.4382803440093994, "kl": 0.52587890625, "learning_rate": 6.078393538427574e-07, "loss": 0.2655, "reward": 2.099888503551483, "reward_std": 0.24568297155201435, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9838170111179352, "step": 3011 }, { "clip_ratio": 0.0, "completion_length": 261.636173248291, "epoch": 0.8997087596146666, "grad_norm": 0.8310350179672241, "kl": 0.27587890625, "learning_rate": 6.042634791034763e-07, "loss": 0.2794, "reward": 2.0390626192092896, "reward_std": 0.1654769480228424, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9854910969734192, "step": 3012 }, { "clip_ratio": 0.0, "completion_length": 251.24554443359375, "epoch": 0.9000074677021881, "grad_norm": 0.6832495927810669, "kl": 0.388671875, "learning_rate": 6.00697826004647e-07, "loss": 0.304, "reward": 2.2053572237491608, "reward_std": 0.20452043786644936, "rewards/accuracy_reward": 0.2455357287544757, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9843750447034836, "step": 3013 }, { "clip_ratio": 0.0, "completion_length": 249.96875762939453, "epoch": 0.9003061757897095, "grad_norm": 0.8215959072113037, "kl": 0.38134765625, "learning_rate": 5.971423984253544e-07, "loss": 0.2712, "reward": 2.0585938692092896, "reward_std": 0.23624766990542412, "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9827009439468384, "step": 3014 }, { "clip_ratio": 0.0, "completion_length": 282.48438262939453, "epoch": 0.9006048838772309, "grad_norm": 0.9175580739974976, "kl": 0.40087890625, "learning_rate": 5.93597200233551e-07, "loss": 0.2774, "reward": 2.0212054550647736, "reward_std": 0.2222334947437048, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.9620535969734192, "rewards/tag_count_reward": 0.9787946939468384, "step": 3015 }, { "clip_ratio": 0.0, "completion_length": 265.05135345458984, "epoch": 0.9009035919647524, "grad_norm": 0.777262270450592, "kl": 0.347900390625, "learning_rate": 5.900622352860675e-07, "loss": 0.1823, "reward": 2.1060268878936768, "reward_std": 0.16965249925851822, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.9799107313156128, "rewards/tag_count_reward": 0.9899553954601288, "step": 3016 }, { "clip_ratio": 0.0, "completion_length": 277.98439025878906, "epoch": 0.9012023000522739, "grad_norm": 1.541563868522644, "kl": 0.50390625, "learning_rate": 5.865375074286006e-07, "loss": 0.3298, "reward": 2.0597099363803864, "reward_std": 0.32768674194812775, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.95089291036129, "rewards/tag_count_reward": 0.9748884439468384, "step": 3017 }, { "clip_ratio": 0.0, "completion_length": 277.9620666503906, "epoch": 0.9015010081397954, "grad_norm": 1.3840631246566772, "kl": 0.5673828125, "learning_rate": 5.830230204957044e-07, "loss": 0.2835, "reward": 2.031250149011612, "reward_std": 0.2723316475749016, "rewards/accuracy_reward": 0.10267857671715319, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.973214328289032, "step": 3018 }, { "clip_ratio": 0.0, "completion_length": 258.03571701049805, "epoch": 0.9017997162273168, "grad_norm": 0.788969099521637, "kl": 0.48828125, "learning_rate": 5.795187783108003e-07, "loss": 0.2676, "reward": 1.9983260035514832, "reward_std": 0.1944243535399437, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9827009290456772, "step": 3019 }, { "clip_ratio": 0.0, "completion_length": 246.6116180419922, "epoch": 0.9020984243148383, "grad_norm": 0.5213146805763245, "kl": 0.30029296875, "learning_rate": 5.7602478468616e-07, "loss": 0.0988, "reward": 2.1796876192092896, "reward_std": 0.1471894783899188, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9966518133878708, "step": 3020 }, { "clip_ratio": 0.0, "completion_length": 265.84376525878906, "epoch": 0.9023971324023597, "grad_norm": 1.2004528045654297, "kl": 0.325927734375, "learning_rate": 5.72541043422904e-07, "loss": 0.1995, "reward": 2.132812589406967, "reward_std": 0.26708970218896866, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9832589775323868, "step": 3021 }, { "clip_ratio": 0.0, "completion_length": 256.0357246398926, "epoch": 0.9026958404898813, "grad_norm": 0.5770425796508789, "kl": 0.3720703125, "learning_rate": 5.690675583110028e-07, "loss": 0.2627, "reward": 2.0552456080913544, "reward_std": 0.23145877569913864, "rewards/accuracy_reward": 0.10491071920841932, "rewards/format_reward": 0.964285746216774, "rewards/tag_count_reward": 0.98604916036129, "step": 3022 }, { "clip_ratio": 0.0, "completion_length": 236.7209930419922, "epoch": 0.9029945485774027, "grad_norm": 1.138218879699707, "kl": 0.358154296875, "learning_rate": 5.656043331292682e-07, "loss": 0.1642, "reward": 2.1635045409202576, "reward_std": 0.14038199931383133, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.9910714477300644, "rewards/tag_count_reward": 0.9960937798023224, "step": 3023 }, { "clip_ratio": 0.0, "completion_length": 270.3750114440918, "epoch": 0.9032932566649242, "grad_norm": 1.1904997825622559, "kl": 0.334228515625, "learning_rate": 5.621513716453475e-07, "loss": 0.1916, "reward": 2.0452009439468384, "reward_std": 0.17798166163265705, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9893973618745804, "step": 3024 }, { "clip_ratio": 0.0, "completion_length": 265.042423248291, "epoch": 0.9035919647524456, "grad_norm": 2.0671374797821045, "kl": 0.82373046875, "learning_rate": 5.58708677615728e-07, "loss": 0.3922, "reward": 2.0625001192092896, "reward_std": 0.2996407188475132, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.973214328289032, "step": 3025 }, { "clip_ratio": 0.0, "completion_length": 270.29689025878906, "epoch": 0.9038906728399672, "grad_norm": 0.5190010070800781, "kl": 0.2841796875, "learning_rate": 5.552762547857194e-07, "loss": 0.148, "reward": 2.001674145460129, "reward_std": 0.12978562340140343, "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9927455931901932, "step": 3026 }, { "clip_ratio": 0.0, "completion_length": 245.78572845458984, "epoch": 0.9041893809274886, "grad_norm": 0.448390394449234, "kl": 0.2734375, "learning_rate": 5.518541068894622e-07, "loss": 0.1798, "reward": 2.073660761117935, "reward_std": 0.1821199506521225, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.988839328289032, "step": 3027 }, { "clip_ratio": 0.0, "completion_length": 257.28126525878906, "epoch": 0.9044880890150101, "grad_norm": 0.7431661486625671, "kl": 0.595947265625, "learning_rate": 5.484422376499222e-07, "loss": 0.2055, "reward": 2.0686384737491608, "reward_std": 0.17695347592234612, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9838170111179352, "step": 3028 }, { "clip_ratio": 0.0, "completion_length": 245.75893783569336, "epoch": 0.9047867971025315, "grad_norm": 1.211181640625, "kl": 0.247314453125, "learning_rate": 5.45040650778873e-07, "loss": 0.164, "reward": 2.111607253551483, "reward_std": 0.2006942443549633, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9955357611179352, "step": 3029 }, { "clip_ratio": 0.0, "completion_length": 264.1852798461914, "epoch": 0.905085505190053, "grad_norm": 0.7955739498138428, "kl": 0.36962890625, "learning_rate": 5.416493499769094e-07, "loss": 0.1749, "reward": 2.0636161863803864, "reward_std": 0.1429333444684744, "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9921875447034836, "step": 3030 }, { "clip_ratio": 0.0, "completion_length": 273.0848388671875, "epoch": 0.9053842132775745, "grad_norm": 0.8758342266082764, "kl": 0.25927734375, "learning_rate": 5.382683389334375e-07, "loss": 0.2534, "reward": 2.1495536863803864, "reward_std": 0.24862253665924072, "rewards/accuracy_reward": 0.19196429196745157, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9821428805589676, "step": 3031 }, { "clip_ratio": 0.0, "completion_length": 284.7254638671875, "epoch": 0.905682921365096, "grad_norm": 1.9718972444534302, "kl": 0.428466796875, "learning_rate": 5.348976213266621e-07, "loss": 0.0814, "reward": 2.047991156578064, "reward_std": 0.17440651915967464, "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9899553954601288, "step": 3032 }, { "clip_ratio": 0.0, "completion_length": 255.35046005249023, "epoch": 0.9059816294526174, "grad_norm": 0.39496588706970215, "kl": 0.331298828125, "learning_rate": 5.315372008235941e-07, "loss": 0.1934, "reward": 2.0217635333538055, "reward_std": 0.17117638513445854, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812798023224, "step": 3033 }, { "clip_ratio": 0.0, "completion_length": 260.2745704650879, "epoch": 0.9062803375401389, "grad_norm": 0.5046716928482056, "kl": 0.374267578125, "learning_rate": 5.28187081080045e-07, "loss": 0.2751, "reward": 2.040736734867096, "reward_std": 0.26224610581994057, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9804687947034836, "step": 3034 }, { "clip_ratio": 0.0, "completion_length": 251.55358505249023, "epoch": 0.9065790456276603, "grad_norm": 22.782756805419922, "kl": 0.363037109375, "learning_rate": 5.248472657406123e-07, "loss": 0.1452, "reward": 2.0825893580913544, "reward_std": 0.11375453509390354, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714626312256, "step": 3035 }, { "clip_ratio": 0.0, "completion_length": 252.12054443359375, "epoch": 0.9068777537151819, "grad_norm": 12.199629783630371, "kl": 1.135986328125, "learning_rate": 5.2151775843869e-07, "loss": 0.2975, "reward": 2.1199777126312256, "reward_std": 0.21321552246809006, "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.983816996216774, "step": 3036 }, { "clip_ratio": 0.0, "completion_length": 237.07144165039062, "epoch": 0.9071764618027033, "grad_norm": 0.31103429198265076, "kl": 0.27978515625, "learning_rate": 5.181985627964559e-07, "loss": 0.103, "reward": 2.1662947833538055, "reward_std": 0.18657347187399864, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875298023224, "step": 3037 }, { "clip_ratio": 0.0, "completion_length": 248.9687614440918, "epoch": 0.9074751698902248, "grad_norm": 1.2856038808822632, "kl": 0.36572265625, "learning_rate": 5.148896824248683e-07, "loss": 0.1336, "reward": 2.0943081378936768, "reward_std": 0.1166070718318224, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973618745804, "step": 3038 }, { "clip_ratio": 0.0, "completion_length": 249.3660888671875, "epoch": 0.9077738779777462, "grad_norm": 1.0636917352676392, "kl": 0.30322265625, "learning_rate": 5.115911209236669e-07, "loss": 0.1659, "reward": 2.099888503551483, "reward_std": 0.22109122574329376, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.983816996216774, "step": 3039 }, { "clip_ratio": 0.0, "completion_length": 263.9263572692871, "epoch": 0.9080725860652678, "grad_norm": 0.6961156725883484, "kl": 0.287841796875, "learning_rate": 5.083028818813607e-07, "loss": 0.0736, "reward": 2.0708706378936768, "reward_std": 0.16042655892670155, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.9882812649011612, "step": 3040 }, { "clip_ratio": 0.0, "completion_length": 231.10492324829102, "epoch": 0.9083712941527892, "grad_norm": 1.6654808521270752, "kl": 0.48828125, "learning_rate": 5.050249688752329e-07, "loss": 0.2654, "reward": 2.089843839406967, "reward_std": 0.2125486470758915, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9804687947034836, "step": 3041 }, { "clip_ratio": 0.0, "completion_length": 243.27679824829102, "epoch": 0.9086700022403107, "grad_norm": 0.7424488663673401, "kl": 0.275634765625, "learning_rate": 5.01757385471332e-07, "loss": 0.1926, "reward": 2.1043527722358704, "reward_std": 0.22458931803703308, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.983816996216774, "step": 3042 }, { "clip_ratio": 0.0, "completion_length": 241.91296005249023, "epoch": 0.9089687103278321, "grad_norm": 0.882906973361969, "kl": 0.28369140625, "learning_rate": 4.985001352244667e-07, "loss": 0.1732, "reward": 2.098772406578064, "reward_std": 0.15088721364736557, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973618745804, "step": 3043 }, { "clip_ratio": 0.0, "completion_length": 241.49777221679688, "epoch": 0.9092674184153536, "grad_norm": 4.976792812347412, "kl": 0.24072265625, "learning_rate": 4.95253221678208e-07, "loss": 0.224, "reward": 2.1026787161827087, "reward_std": 0.1881659496575594, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9843750447034836, "step": 3044 }, { "clip_ratio": 0.0, "completion_length": 247.30581665039062, "epoch": 0.909566126502875, "grad_norm": 0.7314548492431641, "kl": 0.32958984375, "learning_rate": 4.920166483648792e-07, "loss": 0.2403, "reward": 2.040736645460129, "reward_std": 0.20157730393111706, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9827009290456772, "step": 3045 }, { "clip_ratio": 0.0, "completion_length": 260.9241180419922, "epoch": 0.9098648345903966, "grad_norm": 0.608651340007782, "kl": 0.2158203125, "learning_rate": 4.887904188055537e-07, "loss": 0.1203, "reward": 1.9916295111179352, "reward_std": 0.11332485266029835, "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973469734192, "step": 3046 }, { "clip_ratio": 0.0, "completion_length": 277.85491943359375, "epoch": 0.910163542677918, "grad_norm": 0.6654749512672424, "kl": 0.310302734375, "learning_rate": 4.855745365100539e-07, "loss": 0.2064, "reward": 2.028459906578064, "reward_std": 0.19947568327188492, "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812798023224, "step": 3047 }, { "clip_ratio": 0.0, "completion_length": 217.51787185668945, "epoch": 0.9104622507654395, "grad_norm": 0.6669544577598572, "kl": 0.269775390625, "learning_rate": 4.823690049769448e-07, "loss": 0.1066, "reward": 2.228236734867096, "reward_std": 0.17498589307069778, "rewards/accuracy_reward": 0.2388392984867096, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9960937649011612, "step": 3048 }, { "clip_ratio": 0.0, "completion_length": 241.24777221679688, "epoch": 0.9107609588529609, "grad_norm": 0.39640939235687256, "kl": 0.220703125, "learning_rate": 4.791738276935299e-07, "loss": 0.0694, "reward": 2.099888503551483, "reward_std": 0.11040736455470324, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.9910714328289032, "rewards/tag_count_reward": 0.9927455484867096, "step": 3049 }, { "clip_ratio": 0.0, "completion_length": 224.7834930419922, "epoch": 0.9110596669404825, "grad_norm": 1.3583825826644897, "kl": 0.615234375, "learning_rate": 4.759890081358487e-07, "loss": 0.2761, "reward": 2.0904018878936768, "reward_std": 0.18683182075619698, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9810268431901932, "step": 3050 }, { "clip_ratio": 0.0, "completion_length": 303.4397430419922, "epoch": 0.9113583750280039, "grad_norm": 0.9148844480514526, "kl": 0.3056640625, "learning_rate": 4.7281454976867535e-07, "loss": 0.2997, "reward": 2.020647406578064, "reward_std": 0.2573734447360039, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.9665178805589676, "rewards/tag_count_reward": 0.9760045111179352, "step": 3051 }, { "clip_ratio": 0.0, "completion_length": 285.2500228881836, "epoch": 0.9116570831155254, "grad_norm": 0.6392362713813782, "kl": 0.43701171875, "learning_rate": 4.696504560455051e-07, "loss": 0.1793, "reward": 2.032366156578064, "reward_std": 0.2018474005162716, "rewards/accuracy_reward": 0.08035714877769351, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.981026828289032, "step": 3052 }, { "clip_ratio": 0.0, "completion_length": 263.65626525878906, "epoch": 0.9119557912030468, "grad_norm": 0.6691222190856934, "kl": 0.229736328125, "learning_rate": 4.664967304085655e-07, "loss": 0.0511, "reward": 2.1595982909202576, "reward_std": 0.10698319971561432, "rewards/accuracy_reward": 0.1718750037252903, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 3053 }, { "clip_ratio": 0.0, "completion_length": 260.9040298461914, "epoch": 0.9122544992905683, "grad_norm": 76.72960662841797, "kl": 0.419189453125, "learning_rate": 4.6335337628879874e-07, "loss": 0.2273, "reward": 2.0585938096046448, "reward_std": 0.2120499201118946, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9893973767757416, "step": 3054 }, { "clip_ratio": 0.0, "completion_length": 290.8995666503906, "epoch": 0.9125532073780898, "grad_norm": 1.8259001970291138, "kl": 0.755859375, "learning_rate": 4.602203971058661e-07, "loss": 0.3099, "reward": 2.0820313096046448, "reward_std": 0.25162869319319725, "rewards/accuracy_reward": 0.14508928847499192, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9748884290456772, "step": 3055 }, { "clip_ratio": 0.0, "completion_length": 259.70983505249023, "epoch": 0.9128519154656113, "grad_norm": 0.47545087337493896, "kl": 0.30517578125, "learning_rate": 4.570977962681444e-07, "loss": 0.1722, "reward": 2.1947545409202576, "reward_std": 0.2065982185304165, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.984933078289032, "step": 3056 }, { "clip_ratio": 0.0, "completion_length": 230.10715103149414, "epoch": 0.9131506235531327, "grad_norm": 0.6110081672668457, "kl": 0.245361328125, "learning_rate": 4.539855771727131e-07, "loss": 0.143, "reward": 2.204241156578064, "reward_std": 0.15699037909507751, "rewards/accuracy_reward": 0.2254464328289032, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875447034836, "step": 3057 }, { "clip_ratio": 0.0, "completion_length": 275.89064025878906, "epoch": 0.9134493316406541, "grad_norm": 0.7396734356880188, "kl": 0.372314453125, "learning_rate": 4.508837432053648e-07, "loss": 0.1829, "reward": 2.0652902722358704, "reward_std": 0.15334712713956833, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9871652275323868, "step": 3058 }, { "clip_ratio": 0.0, "completion_length": 278.9709930419922, "epoch": 0.9137480397281756, "grad_norm": 1.7421045303344727, "kl": 0.32470703125, "learning_rate": 4.477922977405913e-07, "loss": 0.2701, "reward": 1.9821429252624512, "reward_std": 0.1762460470199585, "rewards/accuracy_reward": 0.022321428870782256, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9821428954601288, "step": 3059 }, { "clip_ratio": 0.0, "completion_length": 246.92412185668945, "epoch": 0.9140467478156971, "grad_norm": 0.7313366532325745, "kl": 0.205078125, "learning_rate": 4.4471124414157905e-07, "loss": 0.1702, "reward": 2.0647322237491608, "reward_std": 0.15510740876197815, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9866071939468384, "step": 3060 }, { "clip_ratio": 0.0, "completion_length": 240.85269165039062, "epoch": 0.9143454559032186, "grad_norm": 21.16804313659668, "kl": 0.34619140625, "learning_rate": 4.4164058576021464e-07, "loss": 0.2769, "reward": 2.0591519474983215, "reward_std": 0.21767956763505936, "rewards/accuracy_reward": 0.10044643562287092, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9832589626312256, "step": 3061 }, { "clip_ratio": 0.0, "completion_length": 251.2991180419922, "epoch": 0.91464416399074, "grad_norm": 0.8491114377975464, "kl": 0.209716796875, "learning_rate": 4.3858032593707357e-07, "loss": 0.1463, "reward": 1.9854911863803864, "reward_std": 0.11248911451548338, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9899553954601288, "step": 3062 }, { "clip_ratio": 0.0, "completion_length": 239.3259048461914, "epoch": 0.9149428720782615, "grad_norm": 0.8716181516647339, "kl": 0.459228515625, "learning_rate": 4.355304680014172e-07, "loss": 0.1655, "reward": 2.0496652126312256, "reward_std": 0.20091258734464645, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9849330633878708, "step": 3063 }, { "clip_ratio": 0.0, "completion_length": 226.24777603149414, "epoch": 0.9152415801657829, "grad_norm": 1.0195214748382568, "kl": 0.263671875, "learning_rate": 4.3249101527119253e-07, "loss": 0.1862, "reward": 2.0898438096046448, "reward_std": 0.23436012491583824, "rewards/accuracy_reward": 0.11830357927829027, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9871652275323868, "step": 3064 }, { "clip_ratio": 0.0, "completion_length": 292.6138496398926, "epoch": 0.9155402882533045, "grad_norm": 0.40345850586891174, "kl": 0.195068359375, "learning_rate": 4.29461971053029e-07, "loss": 0.1363, "reward": 2.0898438692092896, "reward_std": 0.17877568304538727, "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9827009290456772, "step": 3065 }, { "clip_ratio": 0.0, "completion_length": 275.08260345458984, "epoch": 0.9158389963408259, "grad_norm": 0.9565253853797913, "kl": 0.2491455078125, "learning_rate": 4.264433386422251e-07, "loss": 0.1335, "reward": 2.050781399011612, "reward_std": 0.15226679481565952, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9882812798023224, "step": 3066 }, { "clip_ratio": 0.0, "completion_length": 262.3080520629883, "epoch": 0.9161377044283474, "grad_norm": 0.5682371854782104, "kl": 0.34814453125, "learning_rate": 4.2343512132276055e-07, "loss": 0.2346, "reward": 2.0920759737491608, "reward_std": 0.23572054877877235, "rewards/accuracy_reward": 0.14508929220028222, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9782366454601288, "step": 3067 }, { "clip_ratio": 0.0, "completion_length": 278.2321472167969, "epoch": 0.9164364125158688, "grad_norm": 0.3989465534687042, "kl": 0.257080078125, "learning_rate": 4.2043732236727973e-07, "loss": 0.1291, "reward": 2.0892857909202576, "reward_std": 0.252856707200408, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9843750298023224, "step": 3068 }, { "clip_ratio": 0.0, "completion_length": 293.20983505249023, "epoch": 0.9167351206033904, "grad_norm": 0.8154208660125732, "kl": 0.305419921875, "learning_rate": 4.1744994503709277e-07, "loss": 0.1479, "reward": 2.09709832072258, "reward_std": 0.2072261944413185, "rewards/accuracy_reward": 0.13616071990691125, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9832589626312256, "step": 3069 }, { "clip_ratio": 0.0, "completion_length": 262.67858123779297, "epoch": 0.9170338286909118, "grad_norm": 0.5321177244186401, "kl": 0.250244140625, "learning_rate": 4.144729925821767e-07, "loss": 0.2886, "reward": 2.1216518878936768, "reward_std": 0.2644714303314686, "rewards/accuracy_reward": 0.16741072572767735, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9787946939468384, "step": 3070 }, { "clip_ratio": 0.0, "completion_length": 270.2053680419922, "epoch": 0.9173325367784333, "grad_norm": 0.40664583444595337, "kl": 0.226318359375, "learning_rate": 4.115064682411607e-07, "loss": 0.1675, "reward": 2.1065849363803864, "reward_std": 0.21682333573698997, "rewards/accuracy_reward": 0.13392857881262898, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812798023224, "step": 3071 }, { "clip_ratio": 0.0, "completion_length": 305.01341247558594, "epoch": 0.9176312448659547, "grad_norm": 9.871896743774414, "kl": 0.41796875, "learning_rate": 4.0855037524133443e-07, "loss": 0.1588, "reward": 1.9938617050647736, "reward_std": 0.15252192690968513, "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9893973618745804, "step": 3072 }, { "clip_ratio": 0.0, "completion_length": 285.6897430419922, "epoch": 0.9179299529534762, "grad_norm": 0.4106616675853729, "kl": 0.38134765625, "learning_rate": 4.0560471679863654e-07, "loss": 0.2203, "reward": 2.062500089406967, "reward_std": 0.2051515094935894, "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9799107611179352, "step": 3073 }, { "clip_ratio": 0.0, "completion_length": 273.4977836608887, "epoch": 0.9182286610409977, "grad_norm": 0.7741965055465698, "kl": 0.1951904296875, "learning_rate": 4.026694961176547e-07, "loss": 0.1308, "reward": 2.0870536267757416, "reward_std": 0.16083713062107563, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9888393133878708, "step": 3074 }, { "clip_ratio": 0.0, "completion_length": 291.3839416503906, "epoch": 0.9185273691285192, "grad_norm": 1.9714410305023193, "kl": 0.41650390625, "learning_rate": 3.9974471639162236e-07, "loss": 0.1177, "reward": 2.0820313692092896, "reward_std": 0.1933456677943468, "rewards/accuracy_reward": 0.11160715157166123, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9882812947034836, "step": 3075 }, { "clip_ratio": 0.0, "completion_length": 269.76564025878906, "epoch": 0.9188260772160406, "grad_norm": 0.8441991806030273, "kl": 0.233154296875, "learning_rate": 3.968303808024121e-07, "loss": 0.2173, "reward": 2.1171876192092896, "reward_std": 0.2093433476984501, "rewards/accuracy_reward": 0.15848215157166123, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9832589626312256, "step": 3076 }, { "clip_ratio": 0.0, "completion_length": 272.3013496398926, "epoch": 0.9191247853035621, "grad_norm": 0.29394295811653137, "kl": 0.244384765625, "learning_rate": 3.939264925205355e-07, "loss": 0.064, "reward": 2.146205425262451, "reward_std": 0.17950067296624184, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.994419664144516, "step": 3077 }, { "clip_ratio": 0.0, "completion_length": 235.85269165039062, "epoch": 0.9194234933910835, "grad_norm": 0.3167065978050232, "kl": 0.1826171875, "learning_rate": 3.910330547051389e-07, "loss": 0.1455, "reward": 2.1049107909202576, "reward_std": 0.18261897936463356, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933035969734192, "step": 3078 }, { "clip_ratio": 0.0, "completion_length": 255.18974685668945, "epoch": 0.9197222014786051, "grad_norm": 1.093669056892395, "kl": 0.48779296875, "learning_rate": 3.881500705039998e-07, "loss": 0.2685, "reward": 2.037388503551483, "reward_std": 0.1911709327250719, "rewards/accuracy_reward": 0.08258929336443543, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9793527275323868, "step": 3079 }, { "clip_ratio": 0.0, "completion_length": 258.9397430419922, "epoch": 0.9200209095661265, "grad_norm": 5.514294147491455, "kl": 0.3212890625, "learning_rate": 3.852775430535194e-07, "loss": 0.2005, "reward": 2.1210938692092896, "reward_std": 0.188827283680439, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.984933078289032, "step": 3080 }, { "clip_ratio": 0.0, "completion_length": 299.6585006713867, "epoch": 0.920319617653648, "grad_norm": 0.7512946724891663, "kl": 0.205322265625, "learning_rate": 3.8241547547873016e-07, "loss": 0.1815, "reward": 2.051897406578064, "reward_std": 0.18888752721250057, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.9804687947034836, "step": 3081 }, { "clip_ratio": 0.0, "completion_length": 265.5089416503906, "epoch": 0.9206183257411694, "grad_norm": 0.4682846665382385, "kl": 0.33984375, "learning_rate": 3.795638708932781e-07, "loss": 0.1147, "reward": 2.0691964626312256, "reward_std": 0.12761100381612778, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933036118745804, "step": 3082 }, { "clip_ratio": 0.0, "completion_length": 278.29466247558594, "epoch": 0.920917033828691, "grad_norm": 0.45237913727760315, "kl": 0.193603515625, "learning_rate": 3.7672273239942936e-07, "loss": 0.1191, "reward": 2.189174234867096, "reward_std": 0.2038479670882225, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9905134290456772, "step": 3083 }, { "clip_ratio": 0.0, "completion_length": 257.1584930419922, "epoch": 0.9212157419162124, "grad_norm": 0.46746453642845154, "kl": 0.21142578125, "learning_rate": 3.738920630880671e-07, "loss": 0.1141, "reward": 2.141741156578064, "reward_std": 0.1919008158147335, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9899553805589676, "step": 3084 }, { "clip_ratio": 0.0, "completion_length": 243.45983123779297, "epoch": 0.9215144500037339, "grad_norm": 0.8577303886413574, "kl": 0.192626953125, "learning_rate": 3.7107186603867917e-07, "loss": 0.1272, "reward": 2.1037946939468384, "reward_std": 0.146028570830822, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.987723246216774, "step": 3085 }, { "clip_ratio": 0.0, "completion_length": 259.80581283569336, "epoch": 0.9218131580912553, "grad_norm": 0.9735211133956909, "kl": 0.34521484375, "learning_rate": 3.682621443193635e-07, "loss": 0.181, "reward": 2.080357253551483, "reward_std": 0.18447329476475716, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9888393133878708, "step": 3086 }, { "clip_ratio": 0.0, "completion_length": 289.83929443359375, "epoch": 0.9221118661787768, "grad_norm": 0.8405665159225464, "kl": 0.36572265625, "learning_rate": 3.654629009868249e-07, "loss": 0.1974, "reward": 1.9949777722358704, "reward_std": 0.1723542995750904, "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9838170111179352, "step": 3087 }, { "clip_ratio": 0.0, "completion_length": 299.39510345458984, "epoch": 0.9224105742662982, "grad_norm": 2.0873019695281982, "kl": 0.312255859375, "learning_rate": 3.6267413908636304e-07, "loss": 0.266, "reward": 2.017857253551483, "reward_std": 0.21986988559365273, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9843750447034836, "step": 3088 }, { "clip_ratio": 0.0, "completion_length": 288.8080520629883, "epoch": 0.9227092823538198, "grad_norm": 0.738541841506958, "kl": 0.2965087890625, "learning_rate": 3.5989586165187884e-07, "loss": 0.2118, "reward": 2.0301340222358704, "reward_std": 0.24900260008871555, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9832589626312256, "step": 3089 }, { "clip_ratio": 0.0, "completion_length": 271.1696586608887, "epoch": 0.9230079904413412, "grad_norm": 1.1840494871139526, "kl": 0.395263671875, "learning_rate": 3.571280717058656e-07, "loss": 0.2039, "reward": 2.071986675262451, "reward_std": 0.21718373708426952, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.982700914144516, "step": 3090 }, { "clip_ratio": 0.0, "completion_length": 281.62947845458984, "epoch": 0.9233066985288627, "grad_norm": 0.3470137119293213, "kl": 0.23974609375, "learning_rate": 3.54370772259407e-07, "loss": 0.0894, "reward": 2.0982143580913544, "reward_std": 0.10116522200405598, "rewards/accuracy_reward": 0.10937500861473382, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.995535746216774, "step": 3091 }, { "clip_ratio": 0.0, "completion_length": 270.00224685668945, "epoch": 0.9236054066163841, "grad_norm": 0.8017109632492065, "kl": 0.22998046875, "learning_rate": 3.5162396631217453e-07, "loss": 0.0854, "reward": 2.0195313692092896, "reward_std": 0.11469315830618143, "rewards/accuracy_reward": 0.03571428777649999, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9927455633878708, "step": 3092 }, { "clip_ratio": 0.0, "completion_length": 247.18974685668945, "epoch": 0.9239041147039057, "grad_norm": 37.04221725463867, "kl": 0.374755859375, "learning_rate": 3.4888765685242465e-07, "loss": 0.2712, "reward": 2.083147406578064, "reward_std": 0.2211399283260107, "rewards/accuracy_reward": 0.12500000651925802, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9827009290456772, "step": 3093 }, { "clip_ratio": 0.0, "completion_length": 263.89733123779297, "epoch": 0.9242028227914271, "grad_norm": 0.5102553367614746, "kl": 0.21337890625, "learning_rate": 3.4616184685699273e-07, "loss": 0.068, "reward": 2.1439733505249023, "reward_std": 0.16899272054433823, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.993303582072258, "rewards/tag_count_reward": 0.9944196492433548, "step": 3094 }, { "clip_ratio": 0.0, "completion_length": 256.823673248291, "epoch": 0.9245015308789486, "grad_norm": 1.3782615661621094, "kl": 0.30078125, "learning_rate": 3.4344653929129554e-07, "loss": 0.1573, "reward": 2.0474331378936768, "reward_std": 0.1702339993789792, "rewards/accuracy_reward": 0.07589286169968545, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973469734192, "step": 3095 }, { "clip_ratio": 0.0, "completion_length": 294.6049270629883, "epoch": 0.92480023896647, "grad_norm": 0.5344149470329285, "kl": 0.2392578125, "learning_rate": 3.4074173710931804e-07, "loss": 0.162, "reward": 2.015625089406967, "reward_std": 0.17612393200397491, "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.988839328289032, "step": 3096 }, { "clip_ratio": 0.0, "completion_length": 290.3102836608887, "epoch": 0.9250989470539915, "grad_norm": 0.8387165665626526, "kl": 0.431640625, "learning_rate": 3.380474432536207e-07, "loss": 0.115, "reward": 2.0089286863803864, "reward_std": 0.167688624933362, "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.979910746216774, "step": 3097 }, { "clip_ratio": 0.0, "completion_length": 256.4442138671875, "epoch": 0.925397655141513, "grad_norm": 1.8232675790786743, "kl": 0.404296875, "learning_rate": 3.3536366065533456e-07, "loss": 0.1638, "reward": 2.0139510333538055, "reward_std": 0.1511822920292616, "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973469734192, "step": 3098 }, { "clip_ratio": 0.0, "completion_length": 282.1741180419922, "epoch": 0.9256963632290345, "grad_norm": 289.8348083496094, "kl": 0.421142578125, "learning_rate": 3.326903922341473e-07, "loss": 0.2699, "reward": 2.1406251192092896, "reward_std": 0.27298905327916145, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.9598214626312256, "rewards/tag_count_reward": 0.973214328289032, "step": 3099 }, { "clip_ratio": 0.0, "completion_length": 302.1651916503906, "epoch": 0.9259950713165559, "grad_norm": 0.34560468792915344, "kl": 0.1990966796875, "learning_rate": 3.30027640898315e-07, "loss": 0.0682, "reward": 2.237165242433548, "reward_std": 0.1747539835050702, "rewards/accuracy_reward": 0.2656250046566129, "rewards/format_reward": 0.9821428656578064, "rewards/tag_count_reward": 0.989397332072258, "step": 3100 }, { "clip_ratio": 0.0, "completion_length": 257.27233123779297, "epoch": 0.9262937794040773, "grad_norm": 0.503835916519165, "kl": 0.41552734375, "learning_rate": 3.2737540954465244e-07, "loss": 0.2281, "reward": 2.1199777126312256, "reward_std": 0.1976589448750019, "rewards/accuracy_reward": 0.1540178693830967, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9882812947034836, "step": 3101 }, { "clip_ratio": 0.0, "completion_length": 277.38171768188477, "epoch": 0.9265924874915988, "grad_norm": 0.27225157618522644, "kl": 0.216796875, "learning_rate": 3.247337010585228e-07, "loss": 0.1011, "reward": 2.031250089406967, "reward_std": 0.11506268568336964, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9888393133878708, "step": 3102 }, { "clip_ratio": 0.0, "completion_length": 264.08706283569336, "epoch": 0.9268911955791203, "grad_norm": 6.823877811431885, "kl": 0.279296875, "learning_rate": 3.221025183138493e-07, "loss": 0.1323, "reward": 2.0558036863803864, "reward_std": 0.20499888062477112, "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9888393133878708, "step": 3103 }, { "clip_ratio": 0.0, "completion_length": 328.17189025878906, "epoch": 0.9271899036666418, "grad_norm": 0.6655233502388, "kl": 0.3779296875, "learning_rate": 3.194818641731012e-07, "loss": 0.2173, "reward": 2.142857253551483, "reward_std": 0.25642042979598045, "rewards/accuracy_reward": 0.20535715762525797, "rewards/format_reward": 0.9620536267757416, "rewards/tag_count_reward": 0.9754464775323868, "step": 3104 }, { "clip_ratio": 0.0, "completion_length": 261.6294746398926, "epoch": 0.9274886117541632, "grad_norm": 0.5930857062339783, "kl": 0.448486328125, "learning_rate": 3.168717414872902e-07, "loss": 0.2495, "reward": 2.0803571939468384, "reward_std": 0.24253302067518234, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9821428805589676, "step": 3105 }, { "clip_ratio": 0.0, "completion_length": 282.2768020629883, "epoch": 0.9277873198416847, "grad_norm": 1.8069514036178589, "kl": 0.261962890625, "learning_rate": 3.1427215309597693e-07, "loss": 0.2101, "reward": 2.0959821939468384, "reward_std": 0.21087980084121227, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9843750298023224, "step": 3106 }, { "clip_ratio": 0.0, "completion_length": 298.2812728881836, "epoch": 0.9280860279292061, "grad_norm": 0.2927801012992859, "kl": 0.18310546875, "learning_rate": 3.1168310182725814e-07, "loss": 0.1205, "reward": 2.1439733505249023, "reward_std": 0.19539373368024826, "rewards/accuracy_reward": 0.16964286682195961, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.987723246216774, "step": 3107 }, { "clip_ratio": 0.0, "completion_length": 290.80804443359375, "epoch": 0.9283847360167277, "grad_norm": 0.5956535339355469, "kl": 0.33984375, "learning_rate": 3.0910459049776633e-07, "loss": 0.2129, "reward": 2.039062589406967, "reward_std": 0.21512559428811073, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9787946939468384, "step": 3108 }, { "clip_ratio": 0.0, "completion_length": 259.5134048461914, "epoch": 0.9286834441042491, "grad_norm": 0.6047071814537048, "kl": 0.192626953125, "learning_rate": 3.0653662191267087e-07, "loss": 0.085, "reward": 2.1953126192092896, "reward_std": 0.22918003425002098, "rewards/accuracy_reward": 0.20758929289877415, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.994419664144516, "step": 3109 }, { "clip_ratio": 0.0, "completion_length": 273.08260345458984, "epoch": 0.9289821521917706, "grad_norm": 128.18223571777344, "kl": 0.329833984375, "learning_rate": 3.039791988656693e-07, "loss": 0.1695, "reward": 2.024553656578064, "reward_std": 0.17295844480395317, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9843750447034836, "step": 3110 }, { "clip_ratio": 0.0, "completion_length": 250.03795623779297, "epoch": 0.929280860279292, "grad_norm": 0.7361783385276794, "kl": 0.369873046875, "learning_rate": 3.0143232413898607e-07, "loss": 0.2581, "reward": 2.037388503551483, "reward_std": 0.1813585739582777, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9882812947034836, "step": 3111 }, { "clip_ratio": 0.0, "completion_length": 280.4687614440918, "epoch": 0.9295795683668135, "grad_norm": 1.045465111732483, "kl": 0.40478515625, "learning_rate": 2.9889600050337363e-07, "loss": 0.1752, "reward": 2.1902902722358704, "reward_std": 0.2385418340563774, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.984933078289032, "step": 3112 }, { "clip_ratio": 0.0, "completion_length": 277.21429443359375, "epoch": 0.929878276454335, "grad_norm": 0.817097544670105, "kl": 0.267333984375, "learning_rate": 2.9637023071810155e-07, "loss": 0.0955, "reward": 2.131138503551483, "reward_std": 0.16662171483039856, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9972098618745804, "step": 3113 }, { "clip_ratio": 0.0, "completion_length": 304.4464416503906, "epoch": 0.9301769845418565, "grad_norm": 1.7860087156295776, "kl": 0.356689453125, "learning_rate": 2.938550175309607e-07, "loss": 0.1815, "reward": 2.0491072237491608, "reward_std": 0.23408632539212704, "rewards/accuracy_reward": 0.09151786123402417, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9843750298023224, "step": 3114 }, { "clip_ratio": 0.0, "completion_length": 294.3281364440918, "epoch": 0.9304756926293779, "grad_norm": 0.6138404607772827, "kl": 0.225830078125, "learning_rate": 2.9135036367825773e-07, "loss": 0.1824, "reward": 2.0786831378936768, "reward_std": 0.19013542868196964, "rewards/accuracy_reward": 0.10937500838190317, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9871652126312256, "step": 3115 }, { "clip_ratio": 0.0, "completion_length": 272.95760345458984, "epoch": 0.9307744007168994, "grad_norm": 2.942750930786133, "kl": 0.712646484375, "learning_rate": 2.888562718848076e-07, "loss": 0.2604, "reward": 2.1300223767757416, "reward_std": 0.22335541620850563, "rewards/accuracy_reward": 0.1741071455180645, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9804687947034836, "step": 3116 }, { "clip_ratio": 0.0, "completion_length": 311.2276916503906, "epoch": 0.9310731088044208, "grad_norm": 1.0676602125167847, "kl": 0.2763671875, "learning_rate": 2.863727448639386e-07, "loss": 0.2962, "reward": 2.2109375596046448, "reward_std": 0.3344794027507305, "rewards/accuracy_reward": 0.2700893022119999, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9743303954601288, "step": 3117 }, { "clip_ratio": 0.0, "completion_length": 287.198673248291, "epoch": 0.9313718168919424, "grad_norm": 2.22121000289917, "kl": 0.525146484375, "learning_rate": 2.838997853174874e-07, "loss": 0.3063, "reward": 2.001674175262451, "reward_std": 0.22914112359285355, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.9620535969734192, "rewards/tag_count_reward": 0.974888414144516, "step": 3118 }, { "clip_ratio": 0.0, "completion_length": 296.97991943359375, "epoch": 0.9316705249794638, "grad_norm": 0.6036094427108765, "kl": 0.24951171875, "learning_rate": 2.8143739593578854e-07, "loss": 0.1054, "reward": 2.142299175262451, "reward_std": 0.1653799321502447, "rewards/accuracy_reward": 0.16964286752045155, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9905134439468384, "step": 3119 }, { "clip_ratio": 0.0, "completion_length": 299.73216247558594, "epoch": 0.9319692330669853, "grad_norm": 2.650709867477417, "kl": 0.576171875, "learning_rate": 2.7898557939768254e-07, "loss": 0.2398, "reward": 2.0831474363803864, "reward_std": 0.23213340528309345, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9782366454601288, "step": 3120 }, { "clip_ratio": 0.0, "completion_length": 270.7254638671875, "epoch": 0.9322679411545067, "grad_norm": 0.808188796043396, "kl": 0.32763671875, "learning_rate": 2.7654433837050245e-07, "loss": 0.2433, "reward": 2.0429688096046448, "reward_std": 0.2613872140645981, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9804687947034836, "step": 3121 }, { "clip_ratio": 0.0, "completion_length": 282.62947845458984, "epoch": 0.9325666492420283, "grad_norm": 1.3245550394058228, "kl": 0.255859375, "learning_rate": 2.741136755100815e-07, "loss": 0.0696, "reward": 2.0864956378936768, "reward_std": 0.17579114064574242, "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9905134290456772, "step": 3122 }, { "clip_ratio": 0.0, "completion_length": 314.46653747558594, "epoch": 0.9328653573295497, "grad_norm": 1.214498519897461, "kl": 0.533203125, "learning_rate": 2.7169359346074344e-07, "loss": 0.219, "reward": 2.0518973767757416, "reward_std": 0.20408043265342712, "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9849330633878708, "step": 3123 }, { "clip_ratio": 0.0, "completion_length": 294.8549270629883, "epoch": 0.9331640654170712, "grad_norm": 0.5608884692192078, "kl": 0.251953125, "learning_rate": 2.6928409485529773e-07, "loss": 0.1667, "reward": 2.093750089406967, "reward_std": 0.22831793501973152, "rewards/accuracy_reward": 0.12946429336443543, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9843750447034836, "step": 3124 }, { "clip_ratio": 0.0, "completion_length": 297.88840103149414, "epoch": 0.9334627735045926, "grad_norm": 0.7981370687484741, "kl": 0.3173828125, "learning_rate": 2.6688518231504535e-07, "loss": 0.1243, "reward": 2.0920759737491608, "reward_std": 0.13865437917411327, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652126312256, "step": 3125 }, { "clip_ratio": 0.0, "completion_length": 305.2366256713867, "epoch": 0.9337614815921141, "grad_norm": 0.5700733661651611, "kl": 0.271484375, "learning_rate": 2.6449685844976645e-07, "loss": 0.2223, "reward": 2.064174175262451, "reward_std": 0.18285726383328438, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9860491454601288, "step": 3126 }, { "clip_ratio": 0.0, "completion_length": 285.6651916503906, "epoch": 0.9340601896796356, "grad_norm": 1.3507276773452759, "kl": 0.53662109375, "learning_rate": 2.621191258577238e-07, "loss": 0.2369, "reward": 2.017299175262451, "reward_std": 0.23815695941448212, "rewards/accuracy_reward": 0.0736607201397419, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.977120578289032, "step": 3127 }, { "clip_ratio": 0.0, "completion_length": 302.6183166503906, "epoch": 0.9343588977671571, "grad_norm": 0.6556432247161865, "kl": 0.310791015625, "learning_rate": 2.5975198712565706e-07, "loss": 0.2315, "reward": 2.185826003551483, "reward_std": 0.23991841822862625, "rewards/accuracy_reward": 0.2366071566939354, "rewards/format_reward": 0.9709821939468384, "rewards/tag_count_reward": 0.9782366305589676, "step": 3128 }, { "clip_ratio": 0.0, "completion_length": 280.40179443359375, "epoch": 0.9346576058546785, "grad_norm": 0.9634621739387512, "kl": 0.369140625, "learning_rate": 2.573954448287819e-07, "loss": 0.195, "reward": 1.9988840818405151, "reward_std": 0.18135487847030163, "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9877232611179352, "step": 3129 }, { "clip_ratio": 0.0, "completion_length": 235.45983505249023, "epoch": 0.9349563139422, "grad_norm": 0.5645443797111511, "kl": 0.32568359375, "learning_rate": 2.5504950153078413e-07, "loss": 0.1985, "reward": 2.0373885333538055, "reward_std": 0.18100661225616932, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9860491454601288, "step": 3130 }, { "clip_ratio": 0.0, "completion_length": 282.53126525878906, "epoch": 0.9352550220297214, "grad_norm": 0.7280839085578918, "kl": 0.40771484375, "learning_rate": 2.527141597838212e-07, "loss": 0.2126, "reward": 2.035714417695999, "reward_std": 0.21736213564872742, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9843750298023224, "step": 3131 }, { "clip_ratio": 0.0, "completion_length": 261.8169746398926, "epoch": 0.935553730117243, "grad_norm": 1.0892854928970337, "kl": 0.591796875, "learning_rate": 2.5038942212851637e-07, "loss": 0.1155, "reward": 2.0094867050647736, "reward_std": 0.16483353544026613, "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9849330633878708, "step": 3132 }, { "clip_ratio": 0.0, "completion_length": 290.6696548461914, "epoch": 0.9358524382047644, "grad_norm": 0.7350673079490662, "kl": 0.3984375, "learning_rate": 2.4807529109395544e-07, "loss": 0.3265, "reward": 2.0998884737491608, "reward_std": 0.31678539514541626, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.977120578289032, "step": 3133 }, { "clip_ratio": 0.0, "completion_length": 282.67188262939453, "epoch": 0.9361511462922859, "grad_norm": 0.6072381138801575, "kl": 0.43994140625, "learning_rate": 2.4577176919768687e-07, "loss": 0.2074, "reward": 2.0647321939468384, "reward_std": 0.2044332306832075, "rewards/accuracy_reward": 0.11607143143191934, "rewards/format_reward": 0.9687500596046448, "rewards/tag_count_reward": 0.9799107611179352, "step": 3134 }, { "clip_ratio": 0.0, "completion_length": 272.2522430419922, "epoch": 0.9364498543798073, "grad_norm": 1.070470929145813, "kl": 0.3056640625, "learning_rate": 2.4347885894571487e-07, "loss": 0.1895, "reward": 2.083705484867096, "reward_std": 0.15126267075538635, "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.987723246216774, "step": 3135 }, { "clip_ratio": 0.0, "completion_length": 308.45537185668945, "epoch": 0.9367485624673288, "grad_norm": 1.3308638334274292, "kl": 0.2744140625, "learning_rate": 2.4119656283250304e-07, "loss": 0.1276, "reward": 2.032366156578064, "reward_std": 0.14103236980736256, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9854911267757416, "step": 3136 }, { "clip_ratio": 0.0, "completion_length": 284.4955520629883, "epoch": 0.9370472705548503, "grad_norm": 0.8253914713859558, "kl": 0.42822265625, "learning_rate": 2.389248833409663e-07, "loss": 0.2552, "reward": 2.0669643878936768, "reward_std": 0.22273854352533817, "rewards/accuracy_reward": 0.13616071757860482, "rewards/format_reward": 0.957589328289032, "rewards/tag_count_reward": 0.9732143133878708, "step": 3137 }, { "clip_ratio": 0.0, "completion_length": 307.8549270629883, "epoch": 0.9373459786423718, "grad_norm": 0.5158113241195679, "kl": 0.275390625, "learning_rate": 2.366638229424667e-07, "loss": 0.1793, "reward": 2.080357313156128, "reward_std": 0.20561711862683296, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9866071790456772, "step": 3138 }, { "clip_ratio": 0.0, "completion_length": 255.90402603149414, "epoch": 0.9376446867298932, "grad_norm": 0.616855800151825, "kl": 0.244384765625, "learning_rate": 2.344133840968188e-07, "loss": 0.1329, "reward": 2.1473215222358704, "reward_std": 0.20044521614909172, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714775323868, "step": 3139 }, { "clip_ratio": 0.0, "completion_length": 301.23661041259766, "epoch": 0.9379433948174147, "grad_norm": 0.7747586369514465, "kl": 0.1966552734375, "learning_rate": 2.3217356925227973e-07, "loss": 0.1645, "reward": 2.005022406578064, "reward_std": 0.11413238383829594, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9871652126312256, "step": 3140 }, { "clip_ratio": 0.0, "completion_length": 285.81921005249023, "epoch": 0.9382421029049361, "grad_norm": 0.40055835247039795, "kl": 0.250732421875, "learning_rate": 2.2994438084554594e-07, "loss": 0.1065, "reward": 2.0407367050647736, "reward_std": 0.13202444836497307, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9916295111179352, "step": 3141 }, { "clip_ratio": 0.0, "completion_length": 248.7634048461914, "epoch": 0.9385408109924577, "grad_norm": 0.7252879738807678, "kl": 0.350341796875, "learning_rate": 2.2772582130175747e-07, "loss": 0.1925, "reward": 2.1997768878936768, "reward_std": 0.16212442237883806, "rewards/accuracy_reward": 0.22321429662406445, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9899553805589676, "step": 3142 }, { "clip_ratio": 0.0, "completion_length": 290.4910888671875, "epoch": 0.9388395190799791, "grad_norm": 0.663550078868866, "kl": 0.432373046875, "learning_rate": 2.2551789303449034e-07, "loss": 0.2748, "reward": 2.0055804550647736, "reward_std": 0.23222580924630165, "rewards/accuracy_reward": 0.05803571455180645, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9787946939468384, "step": 3143 }, { "clip_ratio": 0.0, "completion_length": 296.1160774230957, "epoch": 0.9391382271675005, "grad_norm": 0.8385772705078125, "kl": 0.41064453125, "learning_rate": 2.2332059844575317e-07, "loss": 0.2736, "reward": 2.029576003551483, "reward_std": 0.21803703159093857, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.9665178805589676, "rewards/tag_count_reward": 0.9804687947034836, "step": 3144 }, { "clip_ratio": 0.0, "completion_length": 291.81921005249023, "epoch": 0.939436935255022, "grad_norm": 0.44105786085128784, "kl": 0.272705078125, "learning_rate": 2.2113393992598596e-07, "loss": 0.115, "reward": 2.013951003551483, "reward_std": 0.19861062709242105, "rewards/accuracy_reward": 0.05357142956927419, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9827009290456772, "step": 3145 }, { "clip_ratio": 0.0, "completion_length": 260.66296005249023, "epoch": 0.9397356433425434, "grad_norm": 1.9960774183273315, "kl": 0.74609375, "learning_rate": 2.1895791985406257e-07, "loss": 0.1779, "reward": 2.201451003551483, "reward_std": 0.2816121205687523, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9804687947034836, "step": 3146 }, { "clip_ratio": 0.0, "completion_length": 286.2879524230957, "epoch": 0.940034351430065, "grad_norm": 0.5549787282943726, "kl": 0.198974609375, "learning_rate": 2.1679254059727594e-07, "loss": 0.1789, "reward": 2.074776917695999, "reward_std": 0.21873332187533379, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9832589775323868, "step": 3147 }, { "clip_ratio": 0.0, "completion_length": 257.8080406188965, "epoch": 0.9403330595175864, "grad_norm": 1.0788522958755493, "kl": 0.447509765625, "learning_rate": 2.1463780451134841e-07, "loss": 0.1723, "reward": 2.0507813692092896, "reward_std": 0.13302847556769848, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.9843750596046448, "rewards/tag_count_reward": 0.9882812947034836, "step": 3148 }, { "clip_ratio": 0.0, "completion_length": 295.64064025878906, "epoch": 0.9406317676051079, "grad_norm": 0.3297185003757477, "kl": 0.200439453125, "learning_rate": 2.124937139404204e-07, "loss": 0.0691, "reward": 2.0719867050647736, "reward_std": 0.2134438157081604, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652126312256, "step": 3149 }, { "clip_ratio": 0.0, "completion_length": 269.26340103149414, "epoch": 0.9409304756926293, "grad_norm": 0.8211467862129211, "kl": 0.328125, "learning_rate": 2.103602712170527e-07, "loss": 0.3575, "reward": 2.117187649011612, "reward_std": 0.27511426620185375, "rewards/accuracy_reward": 0.16294643888249993, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.981026828289032, "step": 3150 }, { "clip_ratio": 0.0, "completion_length": 331.9665298461914, "epoch": 0.9412291837801509, "grad_norm": 0.4835298955440521, "kl": 0.225341796875, "learning_rate": 2.0823747866222322e-07, "loss": 0.1597, "reward": 2.020647406578064, "reward_std": 0.20401785522699356, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.9665178805589676, "rewards/tag_count_reward": 0.984933078289032, "step": 3151 }, { "clip_ratio": 0.0, "completion_length": 298.1763610839844, "epoch": 0.9415278918676723, "grad_norm": 0.7343883514404297, "kl": 0.31689453125, "learning_rate": 2.0612533858531902e-07, "loss": 0.1351, "reward": 2.103794753551483, "reward_std": 0.14210401847958565, "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.987723246216774, "step": 3152 }, { "clip_ratio": 0.0, "completion_length": 282.22991943359375, "epoch": 0.9418265999551938, "grad_norm": 0.6275607347488403, "kl": 0.219970703125, "learning_rate": 2.0402385328414543e-07, "loss": 0.2158, "reward": 2.014508992433548, "reward_std": 0.22169330436736345, "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9832589626312256, "step": 3153 }, { "clip_ratio": 0.0, "completion_length": 301.6852798461914, "epoch": 0.9421253080427152, "grad_norm": 1.178055763244629, "kl": 0.342529296875, "learning_rate": 2.019330250449103e-07, "loss": 0.1526, "reward": 2.029017984867096, "reward_std": 0.1319657862186432, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.988839328289032, "step": 3154 }, { "clip_ratio": 0.0, "completion_length": 295.6049270629883, "epoch": 0.9424240161302367, "grad_norm": 0.41349953413009644, "kl": 0.229736328125, "learning_rate": 1.998528561422297e-07, "loss": 0.1469, "reward": 2.0714287161827087, "reward_std": 0.1698397435247898, "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.988839328289032, "step": 3155 }, { "clip_ratio": 0.0, "completion_length": 292.8973388671875, "epoch": 0.9427227242177582, "grad_norm": 0.8675327897071838, "kl": 0.374755859375, "learning_rate": 1.9778334883912342e-07, "loss": 0.1551, "reward": 2.013392984867096, "reward_std": 0.20808405987918377, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9821428954601288, "step": 3156 }, { "clip_ratio": 0.0, "completion_length": 267.91519927978516, "epoch": 0.9430214323052797, "grad_norm": 0.45766884088516235, "kl": 0.2158203125, "learning_rate": 1.9572450538701493e-07, "loss": 0.1415, "reward": 2.0820313692092896, "reward_std": 0.16734597459435463, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9905134290456772, "step": 3157 }, { "clip_ratio": 0.0, "completion_length": 313.95090103149414, "epoch": 0.9433201403928011, "grad_norm": 0.4675556421279907, "kl": 0.33642578125, "learning_rate": 1.9367632802572033e-07, "loss": 0.1838, "reward": 2.016741156578064, "reward_std": 0.19244518503546715, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9810268431901932, "step": 3158 }, { "clip_ratio": 0.0, "completion_length": 304.1339416503906, "epoch": 0.9436188484803226, "grad_norm": 0.41433319449424744, "kl": 0.227294921875, "learning_rate": 1.9163881898345836e-07, "loss": 0.12, "reward": 2.1607143878936768, "reward_std": 0.20117424987256527, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.988839328289032, "step": 3159 }, { "clip_ratio": 0.0, "completion_length": 267.4129638671875, "epoch": 0.943917556567844, "grad_norm": 0.25521236658096313, "kl": 0.42333984375, "learning_rate": 1.8961198047683926e-07, "loss": 0.0664, "reward": 2.1333706378936768, "reward_std": 0.13513538241386414, "rewards/accuracy_reward": 0.15625000488944352, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9905134290456772, "step": 3160 }, { "clip_ratio": 0.0, "completion_length": 298.28125762939453, "epoch": 0.9442162646553656, "grad_norm": 0.7265112400054932, "kl": 0.271728515625, "learning_rate": 1.8759581471086363e-07, "loss": 0.1568, "reward": 2.0675224363803864, "reward_std": 0.21244286745786667, "rewards/accuracy_reward": 0.09598214761354029, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652126312256, "step": 3161 }, { "clip_ratio": 0.0, "completion_length": 304.06697845458984, "epoch": 0.944514972742887, "grad_norm": 3.0370218753814697, "kl": 0.303955078125, "learning_rate": 1.855903238789225e-07, "loss": 0.1956, "reward": 2.0318081080913544, "reward_std": 0.2380322441458702, "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9804687947034836, "step": 3162 }, { "clip_ratio": 0.0, "completion_length": 284.6026916503906, "epoch": 0.9448136808304085, "grad_norm": 0.5050740838050842, "kl": 0.243408203125, "learning_rate": 1.8359551016279398e-07, "loss": 0.1725, "reward": 2.0937501192092896, "reward_std": 0.1896650856360793, "rewards/accuracy_reward": 0.12053572130389512, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9888393133878708, "step": 3163 }, { "clip_ratio": 0.0, "completion_length": 300.75001525878906, "epoch": 0.9451123889179299, "grad_norm": 2.574799060821533, "kl": 0.205078125, "learning_rate": 1.8161137573263877e-07, "loss": 0.0657, "reward": 2.0491072237491608, "reward_std": 0.1385838482528925, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 3164 }, { "clip_ratio": 0.0, "completion_length": 276.1473388671875, "epoch": 0.9454110970054515, "grad_norm": 1.3834137916564941, "kl": 0.27587890625, "learning_rate": 1.7963792274700242e-07, "loss": 0.1726, "reward": 2.0396206378936768, "reward_std": 0.1969960816204548, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9882812798023224, "step": 3165 }, { "clip_ratio": 0.0, "completion_length": 302.7321548461914, "epoch": 0.9457098050929729, "grad_norm": 0.677958071231842, "kl": 0.273681640625, "learning_rate": 1.7767515335280538e-07, "loss": 0.2274, "reward": 2.008928656578064, "reward_std": 0.2117429655045271, "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.98214291036129, "step": 3166 }, { "clip_ratio": 0.0, "completion_length": 300.98216247558594, "epoch": 0.9460085131804944, "grad_norm": 0.9488958120346069, "kl": 0.215576171875, "learning_rate": 1.757230696853518e-07, "loss": 0.1618, "reward": 2.047991156578064, "reward_std": 0.22158344089984894, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9899553954601288, "step": 3167 }, { "clip_ratio": 0.0, "completion_length": 276.5312614440918, "epoch": 0.9463072212680158, "grad_norm": 1.0603970289230347, "kl": 0.369140625, "learning_rate": 1.7378167386831512e-07, "loss": 0.1802, "reward": 2.0630581378936768, "reward_std": 0.15936411544680595, "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973618745804, "step": 3168 }, { "clip_ratio": 0.0, "completion_length": 253.37055206298828, "epoch": 0.9466059293555373, "grad_norm": 0.45983991026878357, "kl": 0.27978515625, "learning_rate": 1.7185096801374368e-07, "loss": 0.1267, "reward": 2.1997768878936768, "reward_std": 0.1670045405626297, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9921875298023224, "step": 3169 }, { "clip_ratio": 0.0, "completion_length": 310.2678756713867, "epoch": 0.9469046374430587, "grad_norm": 1.3858489990234375, "kl": 0.419921875, "learning_rate": 1.699309542220584e-07, "loss": 0.3249, "reward": 2.0111608505249023, "reward_std": 0.27635224908590317, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9776786118745804, "step": 3170 }, { "clip_ratio": 0.0, "completion_length": 319.9933166503906, "epoch": 0.9472033455305803, "grad_norm": 1.4788070917129517, "kl": 0.343017578125, "learning_rate": 1.68021634582044e-07, "loss": 0.2474, "reward": 2.0022322237491608, "reward_std": 0.2089819461107254, "rewards/accuracy_reward": 0.05133928940631449, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9821428954601288, "step": 3171 }, { "clip_ratio": 0.0, "completion_length": 270.1875190734863, "epoch": 0.9475020536181017, "grad_norm": 0.7200450301170349, "kl": 0.29736328125, "learning_rate": 1.661230111708534e-07, "loss": 0.0803, "reward": 2.109375089406967, "reward_std": 0.09019743651151657, "rewards/accuracy_reward": 0.12723215110599995, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9933035969734192, "step": 3172 }, { "clip_ratio": 0.0, "completion_length": 243.9665298461914, "epoch": 0.9478007617056232, "grad_norm": 0.9042991995811462, "kl": 0.43798828125, "learning_rate": 1.6423508605400318e-07, "loss": 0.1904, "reward": 2.1941965222358704, "reward_std": 0.2341431798413396, "rewards/accuracy_reward": 0.2254464328289032, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.986607164144516, "step": 3173 }, { "clip_ratio": 0.0, "completion_length": 286.22769927978516, "epoch": 0.9480994697931446, "grad_norm": 0.471902996301651, "kl": 0.273193359375, "learning_rate": 1.6235786128537046e-07, "loss": 0.1856, "reward": 1.9771206080913544, "reward_std": 0.13485773652791977, "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9882812798023224, "step": 3174 }, { "clip_ratio": 0.0, "completion_length": 284.75670623779297, "epoch": 0.9483981778806662, "grad_norm": 0.6027974486351013, "kl": 0.287109375, "learning_rate": 1.604913389071927e-07, "loss": 0.1178, "reward": 2.155134081840515, "reward_std": 0.18704111501574516, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553805589676, "step": 3175 }, { "clip_ratio": 0.0, "completion_length": 269.3727836608887, "epoch": 0.9486968859681876, "grad_norm": 0.6019591689109802, "kl": 0.37744140625, "learning_rate": 1.586355209500634e-07, "loss": 0.2723, "reward": 2.0898438692092896, "reward_std": 0.24440956488251686, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.9754464477300644, "rewards/tag_count_reward": 0.9804687798023224, "step": 3176 }, { "clip_ratio": 0.0, "completion_length": 293.94643783569336, "epoch": 0.9489955940557091, "grad_norm": 0.744193434715271, "kl": 0.288330078125, "learning_rate": 1.5679040943292867e-07, "loss": 0.0878, "reward": 2.0351563692092896, "reward_std": 0.152957109734416, "rewards/accuracy_reward": 0.05803571850992739, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 3177 }, { "clip_ratio": 0.0, "completion_length": 287.3415298461914, "epoch": 0.9492943021432305, "grad_norm": 0.8690394759178162, "kl": 0.26611328125, "learning_rate": 1.549560063630906e-07, "loss": 0.1239, "reward": 2.073102742433548, "reward_std": 0.2354080267250538, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9860491454601288, "step": 3178 }, { "clip_ratio": 0.0, "completion_length": 303.0714416503906, "epoch": 0.949593010230752, "grad_norm": 0.5713120698928833, "kl": 0.4599609375, "learning_rate": 1.5313231373619953e-07, "loss": 0.2466, "reward": 2.041294753551483, "reward_std": 0.23480946198105812, "rewards/accuracy_reward": 0.09821429220028222, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9743303954601288, "step": 3179 }, { "clip_ratio": 0.0, "completion_length": 297.60938262939453, "epoch": 0.9498917183182735, "grad_norm": 0.4891367256641388, "kl": 0.353515625, "learning_rate": 1.5131933353625394e-07, "loss": 0.2546, "reward": 2.154017984867096, "reward_std": 0.2612443119287491, "rewards/accuracy_reward": 0.21205357648432255, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9754464775323868, "step": 3180 }, { "clip_ratio": 0.0, "completion_length": 267.2299270629883, "epoch": 0.950190426405795, "grad_norm": 1.0520228147506714, "kl": 0.4814453125, "learning_rate": 1.495170677356006e-07, "loss": 0.2384, "reward": 2.0736607909202576, "reward_std": 0.1952420026063919, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9866071939468384, "step": 3181 }, { "clip_ratio": 0.0, "completion_length": 267.0424270629883, "epoch": 0.9504891344933164, "grad_norm": 0.3892724812030792, "kl": 0.424072265625, "learning_rate": 1.4772551829492444e-07, "loss": 0.1865, "reward": 2.125558167695999, "reward_std": 0.20151981711387634, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9871652275323868, "step": 3182 }, { "clip_ratio": 0.0, "completion_length": 295.2366256713867, "epoch": 0.9507878425808379, "grad_norm": 0.5697357058525085, "kl": 0.26123046875, "learning_rate": 1.459446871632586e-07, "loss": 0.1568, "reward": 2.133928656578064, "reward_std": 0.17860056832432747, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9843750447034836, "step": 3183 }, { "clip_ratio": 0.0, "completion_length": 280.01341247558594, "epoch": 0.9510865506683593, "grad_norm": 0.978269636631012, "kl": 0.35009765625, "learning_rate": 1.4417457627797226e-07, "loss": 0.1295, "reward": 2.138951003551483, "reward_std": 0.22659551724791527, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9893973618745804, "step": 3184 }, { "clip_ratio": 0.0, "completion_length": 351.40626525878906, "epoch": 0.9513852587558809, "grad_norm": 3.441490888595581, "kl": 0.656494140625, "learning_rate": 1.424151875647717e-07, "loss": 0.2041, "reward": 2.0742188692092896, "reward_std": 0.23910923302173615, "rewards/accuracy_reward": 0.12946428824216127, "rewards/format_reward": 0.96651791036129, "rewards/tag_count_reward": 0.97823666036129, "step": 3185 }, { "clip_ratio": 0.0, "completion_length": 293.0669708251953, "epoch": 0.9516839668434023, "grad_norm": 1.389347791671753, "kl": 0.49951171875, "learning_rate": 1.406665229377002e-07, "loss": 0.3117, "reward": 2.0301340520381927, "reward_std": 0.2774460092186928, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9765625596046448, "step": 3186 }, { "clip_ratio": 0.0, "completion_length": 301.51341247558594, "epoch": 0.9519826749309237, "grad_norm": 0.750626802444458, "kl": 0.23486328125, "learning_rate": 1.389285842991339e-07, "loss": 0.2144, "reward": 2.025111675262451, "reward_std": 0.21263299509882927, "rewards/accuracy_reward": 0.06696429080329835, "rewards/format_reward": 0.975446492433548, "rewards/tag_count_reward": 0.9827009439468384, "step": 3187 }, { "clip_ratio": 0.0, "completion_length": 249.33483505249023, "epoch": 0.9522813830184452, "grad_norm": 0.5544939041137695, "kl": 0.3251953125, "learning_rate": 1.3720137353977814e-07, "loss": 0.1473, "reward": 2.110491156578064, "reward_std": 0.18999655172228813, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9921875149011612, "step": 3188 }, { "clip_ratio": 0.0, "completion_length": 295.84822845458984, "epoch": 0.9525800911059666, "grad_norm": 1.4163662195205688, "kl": 0.5732421875, "learning_rate": 1.354848925386698e-07, "loss": 0.2475, "reward": 2.0775670409202576, "reward_std": 0.1957293078303337, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9815848618745804, "step": 3189 }, { "clip_ratio": 0.0, "completion_length": 276.79688262939453, "epoch": 0.9528787991934882, "grad_norm": 0.8292403221130371, "kl": 0.19580078125, "learning_rate": 1.3377914316317186e-07, "loss": 0.2213, "reward": 2.0474331378936768, "reward_std": 0.18259330466389656, "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9827009439468384, "step": 3190 }, { "clip_ratio": 0.0, "completion_length": 295.45983123779297, "epoch": 0.9531775072810096, "grad_norm": 0.6436880230903625, "kl": 0.2890625, "learning_rate": 1.3208412726897324e-07, "loss": 0.1844, "reward": 2.1880581378936768, "reward_std": 0.24711088836193085, "rewards/accuracy_reward": 0.22767858300358057, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.984933078289032, "step": 3191 }, { "clip_ratio": 0.0, "completion_length": 286.16072845458984, "epoch": 0.9534762153685311, "grad_norm": 0.644318699836731, "kl": 0.470703125, "learning_rate": 1.3039984670008443e-07, "loss": 0.297, "reward": 2.0825893580913544, "reward_std": 0.2487937957048416, "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9776786267757416, "step": 3192 }, { "clip_ratio": 0.0, "completion_length": 322.0580596923828, "epoch": 0.9537749234560525, "grad_norm": 0.9535675048828125, "kl": 0.32470703125, "learning_rate": 1.2872630328883752e-07, "loss": 0.1226, "reward": 2.1132813692092896, "reward_std": 0.15116829797625542, "rewards/accuracy_reward": 0.14732143399305642, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9882812947034836, "step": 3193 }, { "clip_ratio": 0.0, "completion_length": 309.8102798461914, "epoch": 0.954073631543574, "grad_norm": 0.9650722146034241, "kl": 0.44482421875, "learning_rate": 1.2706349885588276e-07, "loss": 0.3179, "reward": 2.015625089406967, "reward_std": 0.2780231609940529, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.96651791036129, "step": 3194 }, { "clip_ratio": 0.0, "completion_length": 261.31028747558594, "epoch": 0.9543723396310955, "grad_norm": 0.5416017770767212, "kl": 0.247314453125, "learning_rate": 1.2541143521019093e-07, "loss": 0.134, "reward": 2.196428656578064, "reward_std": 0.17746852152049541, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714626312256, "step": 3195 }, { "clip_ratio": 0.0, "completion_length": 279.10268783569336, "epoch": 0.954671047718617, "grad_norm": 0.5474120378494263, "kl": 0.2509765625, "learning_rate": 1.2377011414904327e-07, "loss": 0.0634, "reward": 2.1160714626312256, "reward_std": 0.1727980189025402, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9933035969734192, "step": 3196 }, { "clip_ratio": 0.0, "completion_length": 263.43751525878906, "epoch": 0.9549697558061384, "grad_norm": 0.5452228784561157, "kl": 0.24560546875, "learning_rate": 1.2213953745803587e-07, "loss": 0.1025, "reward": 2.2081474661827087, "reward_std": 0.18203586339950562, "rewards/accuracy_reward": 0.216517873108387, "rewards/format_reward": 0.9955357313156128, "rewards/tag_count_reward": 0.9960937649011612, "step": 3197 }, { "clip_ratio": 0.0, "completion_length": 290.1607208251953, "epoch": 0.9552684638936599, "grad_norm": 0.6134467124938965, "kl": 0.288818359375, "learning_rate": 1.2051970691107972e-07, "loss": 0.1858, "reward": 2.125558167695999, "reward_std": 0.1828716117888689, "rewards/accuracy_reward": 0.15625000977888703, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9893973618745804, "step": 3198 }, { "clip_ratio": 0.0, "completion_length": 286.66741943359375, "epoch": 0.9555671719811814, "grad_norm": 1.819289207458496, "kl": 0.351806640625, "learning_rate": 1.1891062427038746e-07, "loss": 0.2432, "reward": 2.08537957072258, "reward_std": 0.2526726983487606, "rewards/accuracy_reward": 0.13169643469154835, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9804687947034836, "step": 3199 }, { "clip_ratio": 0.0, "completion_length": 267.72099685668945, "epoch": 0.9558658800687029, "grad_norm": 0.39747723937034607, "kl": 0.215576171875, "learning_rate": 1.1731229128648546e-07, "loss": 0.1836, "reward": 2.0943081378936768, "reward_std": 0.20168601348996162, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.9821429252624512, "rewards/tag_count_reward": 0.9849330931901932, "step": 3200 }, { "clip_ratio": 0.0, "completion_length": 301.5714416503906, "epoch": 0.9561645881562243, "grad_norm": 0.3928349018096924, "kl": 0.2119140625, "learning_rate": 1.1572470969820282e-07, "loss": 0.1923, "reward": 2.0546876192092896, "reward_std": 0.21846067998558283, "rewards/accuracy_reward": 0.08705357671715319, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9854910969734192, "step": 3201 }, { "clip_ratio": 0.0, "completion_length": 264.53796768188477, "epoch": 0.9564632962437458, "grad_norm": 0.941106379032135, "kl": 0.361083984375, "learning_rate": 1.1414788123267351e-07, "loss": 0.0752, "reward": 2.0803572237491608, "reward_std": 0.11270185559988022, "rewards/accuracy_reward": 0.09598215110599995, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9933036118745804, "step": 3202 }, { "clip_ratio": 0.0, "completion_length": 276.9553680419922, "epoch": 0.9567620043312672, "grad_norm": 0.5218517184257507, "kl": 0.33984375, "learning_rate": 1.1258180760533089e-07, "loss": 0.1465, "reward": 2.0686384737491608, "reward_std": 0.18063968420028687, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9882812798023224, "step": 3203 }, { "clip_ratio": 0.0, "completion_length": 310.96876525878906, "epoch": 0.9570607124187888, "grad_norm": 0.4600362181663513, "kl": 0.230224609375, "learning_rate": 1.110264905199121e-07, "loss": 0.1416, "reward": 2.052455425262451, "reward_std": 0.18390361033380032, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9899553954601288, "step": 3204 }, { "clip_ratio": 0.0, "completion_length": 322.12278747558594, "epoch": 0.9573594205063102, "grad_norm": 1.1061396598815918, "kl": 0.50927734375, "learning_rate": 1.0948193166844701e-07, "loss": 0.2059, "reward": 2.048549175262451, "reward_std": 0.19236348755657673, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.983816996216774, "step": 3205 }, { "clip_ratio": 0.0, "completion_length": 287.2901916503906, "epoch": 0.9576581285938317, "grad_norm": 0.6749728322029114, "kl": 0.314453125, "learning_rate": 1.0794813273126592e-07, "loss": 0.1312, "reward": 2.1177456080913544, "reward_std": 0.24952857941389084, "rewards/accuracy_reward": 0.14955358114093542, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9882812947034836, "step": 3206 }, { "clip_ratio": 0.0, "completion_length": 286.04689025878906, "epoch": 0.9579568366813531, "grad_norm": 0.6876540184020996, "kl": 0.320068359375, "learning_rate": 1.0642509537698964e-07, "loss": 0.1323, "reward": 1.989397406578064, "reward_std": 0.12752745859324932, "rewards/accuracy_reward": 0.01785714295692742, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9893973469734192, "step": 3207 }, { "clip_ratio": 0.0, "completion_length": 325.9576110839844, "epoch": 0.9582555447688746, "grad_norm": 0.47985705733299255, "kl": 0.349365234375, "learning_rate": 1.049128212625361e-07, "loss": 0.2395, "reward": 2.0742188692092896, "reward_std": 0.26463310420513153, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.976004496216774, "step": 3208 }, { "clip_ratio": 0.0, "completion_length": 288.61385345458984, "epoch": 0.9585542528563961, "grad_norm": 0.5777809619903564, "kl": 0.17724609375, "learning_rate": 1.0341131203311039e-07, "loss": 0.1505, "reward": 2.107142925262451, "reward_std": 0.12497804500162601, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9910714626312256, "step": 3209 }, { "clip_ratio": 0.0, "completion_length": 284.45760345458984, "epoch": 0.9588529609439176, "grad_norm": 2.051304817199707, "kl": 0.82421875, "learning_rate": 1.0192056932220695e-07, "loss": 0.1429, "reward": 2.138392984867096, "reward_std": 0.2140660509467125, "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9843750447034836, "step": 3210 }, { "clip_ratio": 0.0, "completion_length": 286.6897506713867, "epoch": 0.959151669031439, "grad_norm": 0.5798931121826172, "kl": 0.330322265625, "learning_rate": 1.004405947516085e-07, "loss": 0.1282, "reward": 2.0373884737491608, "reward_std": 0.17403072863817215, "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9860491454601288, "step": 3211 }, { "clip_ratio": 0.0, "completion_length": 282.4062614440918, "epoch": 0.9594503771189605, "grad_norm": 0.6244561076164246, "kl": 0.3916015625, "learning_rate": 9.897138993138156e-08, "loss": 0.1687, "reward": 2.025111675262451, "reward_std": 0.18420894630253315, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652275323868, "step": 3212 }, { "clip_ratio": 0.0, "completion_length": 253.56474685668945, "epoch": 0.9597490852064819, "grad_norm": 0.8022235631942749, "kl": 0.28076171875, "learning_rate": 9.751295645987647e-08, "loss": 0.133, "reward": 2.0513393878936768, "reward_std": 0.12078525312244892, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9910714477300644, "step": 3213 }, { "clip_ratio": 0.0, "completion_length": 281.65179443359375, "epoch": 0.9600477932940035, "grad_norm": 0.5557938814163208, "kl": 0.267333984375, "learning_rate": 9.606529592372738e-08, "loss": 0.1911, "reward": 2.0926340520381927, "reward_std": 0.1473214365541935, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9877232611179352, "step": 3214 }, { "clip_ratio": 0.0, "completion_length": 285.8571548461914, "epoch": 0.9603465013815249, "grad_norm": 1.285169243812561, "kl": 0.53369140625, "learning_rate": 9.462840989784671e-08, "loss": 0.2983, "reward": 2.079799234867096, "reward_std": 0.24719484150409698, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.9687500596046448, "rewards/tag_count_reward": 0.983816996216774, "step": 3215 }, { "clip_ratio": 0.0, "completion_length": 326.0223388671875, "epoch": 0.9606452094690464, "grad_norm": 0.9028376936912537, "kl": 0.1851806640625, "learning_rate": 9.320229994542518e-08, "loss": 0.1524, "reward": 2.114397406578064, "reward_std": 0.24371778219938278, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9893973618745804, "step": 3216 }, { "clip_ratio": 0.0, "completion_length": 288.245548248291, "epoch": 0.9609439175565678, "grad_norm": 0.4401918053627014, "kl": 0.294189453125, "learning_rate": 9.17869676179306e-08, "loss": 0.1364, "reward": 2.0463171005249023, "reward_std": 0.16731526516377926, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.98604916036129, "step": 3217 }, { "clip_ratio": 0.0, "completion_length": 267.22322845458984, "epoch": 0.9612426256440894, "grad_norm": 0.5330562591552734, "kl": 0.21826171875, "learning_rate": 9.038241445510687e-08, "loss": 0.1421, "reward": 2.056361675262451, "reward_std": 0.1696792859584093, "rewards/accuracy_reward": 0.08035714854486287, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9893973618745804, "step": 3218 }, { "clip_ratio": 0.0, "completion_length": 285.62501525878906, "epoch": 0.9615413337316108, "grad_norm": 0.9005562663078308, "kl": 0.258056640625, "learning_rate": 8.898864198496837e-08, "loss": 0.1118, "reward": 2.0290179550647736, "reward_std": 0.15705479681491852, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9910714626312256, "step": 3219 }, { "clip_ratio": 0.0, "completion_length": 318.18751525878906, "epoch": 0.9618400418191323, "grad_norm": 1.0369740724563599, "kl": 0.291259765625, "learning_rate": 8.760565172380443e-08, "loss": 0.1255, "reward": 2.056919753551483, "reward_std": 0.17358743399381638, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9832589626312256, "step": 3220 }, { "clip_ratio": 0.0, "completion_length": 325.3727798461914, "epoch": 0.9621387499066537, "grad_norm": 0.4263221025466919, "kl": 0.264892578125, "learning_rate": 8.62334451761715e-08, "loss": 0.1663, "reward": 2.076451003551483, "reward_std": 0.22732684388756752, "rewards/accuracy_reward": 0.12053572130389512, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9827009290456772, "step": 3221 }, { "clip_ratio": 0.0, "completion_length": 272.62501525878906, "epoch": 0.9624374579941752, "grad_norm": 0.7902252674102783, "kl": 0.400390625, "learning_rate": 8.487202383489656e-08, "loss": 0.1382, "reward": 2.126674234867096, "reward_std": 0.16058737598359585, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9905134439468384, "step": 3222 }, { "clip_ratio": 0.0, "completion_length": 286.3058166503906, "epoch": 0.9627361660816967, "grad_norm": 1.0270602703094482, "kl": 0.468994140625, "learning_rate": 8.352138918107377e-08, "loss": 0.334, "reward": 2.0373884737491608, "reward_std": 0.34105031937360764, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9748884439468384, "step": 3223 }, { "clip_ratio": 0.0, "completion_length": 247.36384963989258, "epoch": 0.9630348741692182, "grad_norm": 0.7833601236343384, "kl": 0.284912109375, "learning_rate": 8.218154268405998e-08, "loss": 0.166, "reward": 2.1015626192092896, "reward_std": 0.21964595839381218, "rewards/accuracy_reward": 0.13169643841683865, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.987723246216774, "step": 3224 }, { "clip_ratio": 0.0, "completion_length": 277.01787185668945, "epoch": 0.9633335822567396, "grad_norm": 0.5623294115066528, "kl": 0.18017578125, "learning_rate": 8.085248580147586e-08, "loss": 0.1481, "reward": 2.1411831378936768, "reward_std": 0.1879043411463499, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652126312256, "step": 3225 }, { "clip_ratio": 0.0, "completion_length": 252.12723922729492, "epoch": 0.9636322903442611, "grad_norm": 0.5059893131256104, "kl": 0.24755859375, "learning_rate": 7.953421997920818e-08, "loss": 0.1215, "reward": 2.0719867050647736, "reward_std": 0.11893540434539318, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.9888393133878708, "rewards/tag_count_reward": 0.9893973469734192, "step": 3226 }, { "clip_ratio": 0.0, "completion_length": 310.8549270629883, "epoch": 0.9639309984317825, "grad_norm": 1.479960560798645, "kl": 0.4853515625, "learning_rate": 7.822674665139751e-08, "loss": 0.1622, "reward": 2.07756707072258, "reward_std": 0.1255411645397544, "rewards/accuracy_reward": 0.10267857764847577, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9927455633878708, "step": 3227 }, { "clip_ratio": 0.0, "completion_length": 300.23662185668945, "epoch": 0.9642297065193041, "grad_norm": 609.6476440429688, "kl": 0.366943359375, "learning_rate": 7.693006724044827e-08, "loss": 0.1376, "reward": 2.1718750596046448, "reward_std": 0.23512903600931168, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9843750298023224, "step": 3228 }, { "clip_ratio": 0.0, "completion_length": 313.7745704650879, "epoch": 0.9645284146068255, "grad_norm": 0.8143172264099121, "kl": 0.283203125, "learning_rate": 7.564418315702093e-08, "loss": 0.2096, "reward": 2.109933078289032, "reward_std": 0.25172020122408867, "rewards/accuracy_reward": 0.15848215017467737, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9827009439468384, "step": 3229 }, { "clip_ratio": 0.0, "completion_length": 286.6272506713867, "epoch": 0.9648271226943469, "grad_norm": 0.47849148511886597, "kl": 0.23779296875, "learning_rate": 7.436909580003093e-08, "loss": 0.1465, "reward": 2.087611734867096, "reward_std": 0.2267274707555771, "rewards/accuracy_reward": 0.11830357741564512, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9871652275323868, "step": 3230 }, { "clip_ratio": 0.0, "completion_length": 283.64064025878906, "epoch": 0.9651258307818684, "grad_norm": 3.9414846897125244, "kl": 0.466064453125, "learning_rate": 7.310480655664864e-08, "loss": 0.1798, "reward": 2.068638503551483, "reward_std": 0.15460349805653095, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812947034836, "step": 3231 }, { "clip_ratio": 0.0, "completion_length": 300.4040412902832, "epoch": 0.9654245388693898, "grad_norm": 1.3560664653778076, "kl": 0.4619140625, "learning_rate": 7.185131680229606e-08, "loss": 0.1716, "reward": 2.0608259439468384, "reward_std": 0.1722700521349907, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9938616305589676, "step": 3232 }, { "clip_ratio": 0.0, "completion_length": 282.03125762939453, "epoch": 0.9657232469569114, "grad_norm": 0.9037994146347046, "kl": 0.353759765625, "learning_rate": 7.060862790064793e-08, "loss": 0.1711, "reward": 2.095424175262451, "reward_std": 0.26933974027633667, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.983816996216774, "step": 3233 }, { "clip_ratio": 0.0, "completion_length": 306.4308166503906, "epoch": 0.9660219550444328, "grad_norm": 0.5154380202293396, "kl": 0.3330078125, "learning_rate": 6.937674120362725e-08, "loss": 0.2384, "reward": 2.059709906578064, "reward_std": 0.23481643199920654, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.970982164144516, "rewards/tag_count_reward": 0.983816996216774, "step": 3234 }, { "clip_ratio": 0.0, "completion_length": 268.9732246398926, "epoch": 0.9663206631319543, "grad_norm": 0.9999491572380066, "kl": 0.203369140625, "learning_rate": 6.815565805140645e-08, "loss": 0.2121, "reward": 2.1361607909202576, "reward_std": 0.23855524137616158, "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9843750298023224, "step": 3235 }, { "clip_ratio": 0.0, "completion_length": 254.5692024230957, "epoch": 0.9666193712194757, "grad_norm": 0.7597762942314148, "kl": 0.311767578125, "learning_rate": 6.694537977240512e-08, "loss": 0.0613, "reward": 2.1361608505249023, "reward_std": 0.1332970317453146, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.9933035969734192, "rewards/tag_count_reward": 0.9955357313156128, "step": 3236 }, { "clip_ratio": 0.0, "completion_length": 291.81250762939453, "epoch": 0.9669180793069972, "grad_norm": 0.4658474624156952, "kl": 0.424072265625, "learning_rate": 6.574590768328559e-08, "loss": 0.2222, "reward": 2.0764510333538055, "reward_std": 0.21550729125738144, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9827009290456772, "step": 3237 }, { "clip_ratio": 0.0, "completion_length": 314.96653747558594, "epoch": 0.9672167873945187, "grad_norm": 1.0646905899047852, "kl": 0.33251953125, "learning_rate": 6.45572430889574e-08, "loss": 0.2345, "reward": 1.9933036863803864, "reward_std": 0.2381448745727539, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.9776786267757416, "step": 3238 }, { "clip_ratio": 0.0, "completion_length": 297.8950958251953, "epoch": 0.9675154954820402, "grad_norm": 0.3788750469684601, "kl": 0.31298828125, "learning_rate": 6.337938728257054e-08, "loss": 0.1862, "reward": 2.034040242433548, "reward_std": 0.21279723197221756, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9782366454601288, "step": 3239 }, { "clip_ratio": 0.0, "completion_length": 287.70314025878906, "epoch": 0.9678142035695616, "grad_norm": 0.6270975470542908, "kl": 0.2587890625, "learning_rate": 6.221234154551781e-08, "loss": 0.1, "reward": 2.073102742433548, "reward_std": 0.1867594923824072, "rewards/accuracy_reward": 0.09821428847499192, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9905134290456772, "step": 3240 }, { "clip_ratio": 0.0, "completion_length": 295.2433166503906, "epoch": 0.9681129116570831, "grad_norm": 2.071657419204712, "kl": 0.364990234375, "learning_rate": 6.105610714742805e-08, "loss": 0.242, "reward": 1.9972099363803864, "reward_std": 0.21860338747501373, "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9815848618745804, "step": 3241 }, { "clip_ratio": 0.0, "completion_length": 290.58929443359375, "epoch": 0.9684116197446045, "grad_norm": 0.8023565411567688, "kl": 0.344482421875, "learning_rate": 5.991068534617394e-08, "loss": 0.2782, "reward": 2.0251117050647736, "reward_std": 0.2000945396721363, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9782366454601288, "step": 3242 }, { "clip_ratio": 0.0, "completion_length": 310.7120666503906, "epoch": 0.9687103278321261, "grad_norm": 0.5825337767601013, "kl": 0.3115234375, "learning_rate": 5.8776077387859845e-08, "loss": 0.1975, "reward": 2.1311384439468384, "reward_std": 0.20880207419395447, "rewards/accuracy_reward": 0.17857143515720963, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9793527275323868, "step": 3243 }, { "clip_ratio": 0.0, "completion_length": 284.1741180419922, "epoch": 0.9690090359196475, "grad_norm": 0.6778314113616943, "kl": 0.243408203125, "learning_rate": 5.765228450682947e-08, "loss": 0.234, "reward": 2.109375089406967, "reward_std": 0.22904885187745094, "rewards/accuracy_reward": 0.15401786053553224, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9821428954601288, "step": 3244 }, { "clip_ratio": 0.0, "completion_length": 276.29689025878906, "epoch": 0.969307744007169, "grad_norm": 0.44576331973075867, "kl": 0.2197265625, "learning_rate": 5.653930792565821e-08, "loss": 0.2071, "reward": 2.032366156578064, "reward_std": 0.17234568763524294, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.9799107313156128, "rewards/tag_count_reward": 0.9854910969734192, "step": 3245 }, { "clip_ratio": 0.0, "completion_length": 308.5915298461914, "epoch": 0.9696064520946904, "grad_norm": 1.170801043510437, "kl": 0.646728515625, "learning_rate": 5.5437148855156387e-08, "loss": 0.1372, "reward": 2.05803582072258, "reward_std": 0.24725212529301643, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9843750447034836, "step": 3246 }, { "clip_ratio": 0.0, "completion_length": 288.28126525878906, "epoch": 0.969905160182212, "grad_norm": 1.2486107349395752, "kl": 0.32763671875, "learning_rate": 5.434580849436377e-08, "loss": 0.1137, "reward": 2.069196492433548, "reward_std": 0.13593664020299911, "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.988839328289032, "rewards/tag_count_reward": 0.9910714626312256, "step": 3247 }, { "clip_ratio": 0.0, "completion_length": 306.7410888671875, "epoch": 0.9702038682697334, "grad_norm": 0.37773188948631287, "kl": 0.268310546875, "learning_rate": 5.3265288030553974e-08, "loss": 0.1105, "reward": 2.0295759439468384, "reward_std": 0.1407629419118166, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.991629496216774, "step": 3248 }, { "clip_ratio": 0.0, "completion_length": 278.54018783569336, "epoch": 0.9705025763572549, "grad_norm": 1.4600814580917358, "kl": 0.42236328125, "learning_rate": 5.2195588639225584e-08, "loss": 0.3569, "reward": 1.9665179550647736, "reward_std": 0.2661643400788307, "rewards/accuracy_reward": 0.024553571827709675, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9776786267757416, "step": 3249 }, { "clip_ratio": 0.0, "completion_length": 310.87501525878906, "epoch": 0.9708012844447763, "grad_norm": 0.4823395013809204, "kl": 0.1859130859375, "learning_rate": 5.1136711484106594e-08, "loss": 0.1253, "reward": 2.0563617050647736, "reward_std": 0.195731271058321, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9893973469734192, "step": 3250 }, { "clip_ratio": 0.0, "completion_length": 292.79911041259766, "epoch": 0.9710999925322978, "grad_norm": 0.46205589175224304, "kl": 0.31689453125, "learning_rate": 5.008865771715221e-08, "loss": 0.1897, "reward": 2.1138393580913544, "reward_std": 0.21392790228128433, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9866071939468384, "step": 3251 }, { "clip_ratio": 0.0, "completion_length": 255.7165298461914, "epoch": 0.9713987006198193, "grad_norm": 1.4249708652496338, "kl": 0.399658203125, "learning_rate": 4.9051428478542604e-08, "loss": 0.1862, "reward": 2.1612724661827087, "reward_std": 0.21489847637712955, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9871652126312256, "step": 3252 }, { "clip_ratio": 0.0, "completion_length": 262.9509048461914, "epoch": 0.9716974087073408, "grad_norm": 0.368712842464447, "kl": 0.309814453125, "learning_rate": 4.802502489668071e-08, "loss": 0.1303, "reward": 2.1478796005249023, "reward_std": 0.20283540710806847, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9893973618745804, "step": 3253 }, { "clip_ratio": 0.0, "completion_length": 282.37500762939453, "epoch": 0.9719961167948622, "grad_norm": 0.9487401247024536, "kl": 0.447998046875, "learning_rate": 4.700944808819441e-08, "loss": 0.2836, "reward": 2.033482253551483, "reward_std": 0.31543320417404175, "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9776786118745804, "step": 3254 }, { "clip_ratio": 0.0, "completion_length": 293.7901916503906, "epoch": 0.9722948248823837, "grad_norm": 0.45565900206565857, "kl": 0.229248046875, "learning_rate": 4.6004699157928824e-08, "loss": 0.1325, "reward": 2.049107253551483, "reward_std": 0.24291660636663437, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9866071790456772, "step": 3255 }, { "clip_ratio": 0.0, "completion_length": 284.33707427978516, "epoch": 0.9725935329699051, "grad_norm": 0.3561834990978241, "kl": 0.285888671875, "learning_rate": 4.501077919895513e-08, "loss": 0.1811, "reward": 2.049107253551483, "reward_std": 0.1934089893475175, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9866071790456772, "step": 3256 }, { "clip_ratio": 0.0, "completion_length": 271.3125114440918, "epoch": 0.9728922410574267, "grad_norm": 0.7284398078918457, "kl": 0.2900390625, "learning_rate": 4.4027689292560626e-08, "loss": 0.1155, "reward": 2.099330425262451, "reward_std": 0.16842786967754364, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9899553805589676, "step": 3257 }, { "clip_ratio": 0.0, "completion_length": 313.7768020629883, "epoch": 0.9731909491449481, "grad_norm": 0.5943313837051392, "kl": 0.268798828125, "learning_rate": 4.3055430508248675e-08, "loss": 0.1965, "reward": 2.0463170409202576, "reward_std": 0.19397597014904022, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9882812947034836, "step": 3258 }, { "clip_ratio": 0.0, "completion_length": 293.3928756713867, "epoch": 0.9734896572324696, "grad_norm": 0.9277057647705078, "kl": 0.346435546875, "learning_rate": 4.2094003903743183e-08, "loss": 0.2278, "reward": 2.049107253551483, "reward_std": 0.21208474785089493, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9843750298023224, "step": 3259 }, { "clip_ratio": 0.0, "completion_length": 284.8437614440918, "epoch": 0.973788365319991, "grad_norm": 0.4175521433353424, "kl": 0.323974609375, "learning_rate": 4.114341052498194e-08, "loss": 0.0862, "reward": 2.0859375596046448, "reward_std": 0.15692584216594696, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.986607164144516, "rewards/tag_count_reward": 0.9921875149011612, "step": 3260 }, { "clip_ratio": 0.0, "completion_length": 283.50000762939453, "epoch": 0.9740870734075125, "grad_norm": 0.5158545970916748, "kl": 0.32861328125, "learning_rate": 4.020365140611771e-08, "loss": 0.1394, "reward": 2.1562500596046448, "reward_std": 0.20812692120671272, "rewards/accuracy_reward": 0.1852678619325161, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9866071790456772, "step": 3261 }, { "clip_ratio": 0.0, "completion_length": 311.3616256713867, "epoch": 0.974385781495034, "grad_norm": 0.6159828305244446, "kl": 0.346923828125, "learning_rate": 3.927472756951489e-08, "loss": 0.2027, "reward": 2.0412947833538055, "reward_std": 0.2555908113718033, "rewards/accuracy_reward": 0.08705357741564512, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.981026828289032, "step": 3262 }, { "clip_ratio": 0.0, "completion_length": 297.1651916503906, "epoch": 0.9746844895825555, "grad_norm": 2.0434131622314453, "kl": 0.4453125, "learning_rate": 3.83566400257529e-08, "loss": 0.2543, "reward": 1.9927456378936768, "reward_std": 0.21667683124542236, "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9815848469734192, "step": 3263 }, { "clip_ratio": 0.0, "completion_length": 291.14733123779297, "epoch": 0.9749831976700769, "grad_norm": 0.7468169331550598, "kl": 0.368408203125, "learning_rate": 3.744938977362056e-08, "loss": 0.1306, "reward": 2.130580425262451, "reward_std": 0.2105100229382515, "rewards/accuracy_reward": 0.16071429569274187, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.987723246216774, "step": 3264 }, { "clip_ratio": 0.0, "completion_length": 287.02010345458984, "epoch": 0.9752819057575984, "grad_norm": 0.8368421196937561, "kl": 0.541015625, "learning_rate": 3.655297780011724e-08, "loss": 0.2223, "reward": 2.0463170409202576, "reward_std": 0.18266885355114937, "rewards/accuracy_reward": 0.08928571990691125, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9838170260190964, "step": 3265 }, { "clip_ratio": 0.0, "completion_length": 263.0178642272949, "epoch": 0.9755806138451198, "grad_norm": 1.0118486881256104, "kl": 0.393798828125, "learning_rate": 3.566740508045174e-08, "loss": 0.2239, "reward": 2.1010045409202576, "reward_std": 0.16921955347061157, "rewards/accuracy_reward": 0.13169643771834671, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9893973618745804, "step": 3266 }, { "clip_ratio": 0.0, "completion_length": 262.34822845458984, "epoch": 0.9758793219326414, "grad_norm": 0.6398538947105408, "kl": 0.41552734375, "learning_rate": 3.4792672578038974e-08, "loss": 0.2087, "reward": 2.090959906578064, "reward_std": 0.21482915058732033, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9793527126312256, "step": 3267 }, { "clip_ratio": 0.0, "completion_length": 283.4799270629883, "epoch": 0.9761780300201628, "grad_norm": 0.6004536151885986, "kl": 0.21728515625, "learning_rate": 3.3928781244504384e-08, "loss": 0.1221, "reward": 2.111049234867096, "reward_std": 0.2097957730293274, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9860491305589676, "step": 3268 }, { "clip_ratio": 0.0, "completion_length": 323.55582427978516, "epoch": 0.9764767381076843, "grad_norm": 0.9191804528236389, "kl": 0.305908203125, "learning_rate": 3.3075732019675065e-08, "loss": 0.2543, "reward": 2.0820313096046448, "reward_std": 0.252119816839695, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.977120578289032, "step": 3269 }, { "clip_ratio": 0.0, "completion_length": 256.1495704650879, "epoch": 0.9767754461952057, "grad_norm": 1.5883089303970337, "kl": 0.3525390625, "learning_rate": 3.2233525831586455e-08, "loss": 0.1511, "reward": 2.1690849661827087, "reward_std": 0.12215939350426197, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9927455633878708, "step": 3270 }, { "clip_ratio": 0.0, "completion_length": 281.59153747558594, "epoch": 0.9770741542827273, "grad_norm": 0.8376836776733398, "kl": 0.384033203125, "learning_rate": 3.140216359647452e-08, "loss": 0.216, "reward": 2.103236734867096, "reward_std": 0.23243414610624313, "rewards/accuracy_reward": 0.149553582072258, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.9804687947034836, "step": 3271 }, { "clip_ratio": 0.0, "completion_length": 300.15626525878906, "epoch": 0.9773728623702487, "grad_norm": 0.5552940964698792, "kl": 0.333740234375, "learning_rate": 3.0581646218781346e-08, "loss": 0.2637, "reward": 2.0630581974983215, "reward_std": 0.2754526026546955, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.9732143133878708, "rewards/tag_count_reward": 0.97823666036129, "step": 3272 }, { "clip_ratio": 0.0, "completion_length": 304.08260345458984, "epoch": 0.9776715704577701, "grad_norm": 1.5062617063522339, "kl": 0.485107421875, "learning_rate": 2.9771974591149557e-08, "loss": 0.2117, "reward": 2.033482253551483, "reward_std": 0.2595813807565719, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9799107611179352, "step": 3273 }, { "clip_ratio": 0.0, "completion_length": 291.20314025878906, "epoch": 0.9779702785452916, "grad_norm": 0.7558524012565613, "kl": 0.64990234375, "learning_rate": 2.8973149594422323e-08, "loss": 0.2762, "reward": 2.022321492433548, "reward_std": 0.22902580350637436, "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.979910746216774, "step": 3274 }, { "clip_ratio": 0.0, "completion_length": 289.6696548461914, "epoch": 0.978268986632813, "grad_norm": 1.4167003631591797, "kl": 0.4560546875, "learning_rate": 2.8185172097641156e-08, "loss": 0.3213, "reward": 2.060826003551483, "reward_std": 0.2897471413016319, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9782366454601288, "step": 3275 }, { "clip_ratio": 0.0, "completion_length": 327.9866256713867, "epoch": 0.9785676947203346, "grad_norm": 5.084140777587891, "kl": 0.32666015625, "learning_rate": 2.740804295805144e-08, "loss": 0.1699, "reward": 2.051897406578064, "reward_std": 0.29330139979720116, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9737723618745804, "step": 3276 }, { "clip_ratio": 0.0, "completion_length": 293.354923248291, "epoch": 0.978866402807856, "grad_norm": 0.610213577747345, "kl": 0.39404296875, "learning_rate": 2.6641763021091337e-08, "loss": 0.2662, "reward": 2.0457590222358704, "reward_std": 0.25248760264366865, "rewards/accuracy_reward": 0.10491071967408061, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.97433041036129, "step": 3277 }, { "clip_ratio": 0.0, "completion_length": 292.6384048461914, "epoch": 0.9791651108953775, "grad_norm": 0.9573219418525696, "kl": 0.36572265625, "learning_rate": 2.5886333120398456e-08, "loss": 0.2839, "reward": 2.0412946939468384, "reward_std": 0.28823745623230934, "rewards/accuracy_reward": 0.10044643515720963, "rewards/format_reward": 0.964285746216774, "rewards/tag_count_reward": 0.9765625447034836, "step": 3278 }, { "clip_ratio": 0.0, "completion_length": 311.2589416503906, "epoch": 0.9794638189828989, "grad_norm": 2.615800619125366, "kl": 0.654296875, "learning_rate": 2.514175407780761e-08, "loss": 0.1851, "reward": 2.027343839406967, "reward_std": 0.16132424026727676, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.984933078289032, "step": 3279 }, { "clip_ratio": 0.0, "completion_length": 304.8906364440918, "epoch": 0.9797625270704204, "grad_norm": 0.9644127488136292, "kl": 0.394775390625, "learning_rate": 2.440802670334641e-08, "loss": 0.1537, "reward": 2.0714286267757416, "reward_std": 0.1959780491888523, "rewards/accuracy_reward": 0.1138392926659435, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9821428954601288, "step": 3280 }, { "clip_ratio": 0.0, "completion_length": 299.0133972167969, "epoch": 0.9800612351579419, "grad_norm": 1.6955745220184326, "kl": 0.80859375, "learning_rate": 2.368515179523967e-08, "loss": 0.2692, "reward": 1.9642857909202576, "reward_std": 0.23912580497562885, "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9754464626312256, "step": 3281 }, { "clip_ratio": 0.0, "completion_length": 291.77233123779297, "epoch": 0.9803599432454634, "grad_norm": 0.5074437856674194, "kl": 0.35986328125, "learning_rate": 2.2973130139903878e-08, "loss": 0.155, "reward": 2.0184152722358704, "reward_std": 0.1476927325129509, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.9799107760190964, "rewards/tag_count_reward": 0.9871652275323868, "step": 3282 }, { "clip_ratio": 0.0, "completion_length": 276.44420623779297, "epoch": 0.9806586513329848, "grad_norm": 0.6876992583274841, "kl": 0.259521484375, "learning_rate": 2.22719625119483e-08, "loss": 0.2111, "reward": 2.157366156578064, "reward_std": 0.26242804527282715, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.9709821790456772, "rewards/tag_count_reward": 0.9765625298023224, "step": 3283 }, { "clip_ratio": 0.0, "completion_length": 280.98439025878906, "epoch": 0.9809573594205063, "grad_norm": 0.5012750625610352, "kl": 0.29736328125, "learning_rate": 2.1581649674176086e-08, "loss": 0.242, "reward": 2.0245536863803864, "reward_std": 0.2269148826599121, "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9821428805589676, "step": 3284 }, { "clip_ratio": 0.0, "completion_length": 292.1852798461914, "epoch": 0.9812560675080277, "grad_norm": 0.6427072286605835, "kl": 0.33837890625, "learning_rate": 2.0902192377577624e-08, "loss": 0.2961, "reward": 2.00725457072258, "reward_std": 0.21018891409039497, "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9804688096046448, "step": 3285 }, { "clip_ratio": 0.0, "completion_length": 258.56027603149414, "epoch": 0.9815547755955493, "grad_norm": 0.603365421295166, "kl": 0.4873046875, "learning_rate": 2.023359136133829e-08, "loss": 0.1905, "reward": 2.0909599661827087, "reward_std": 0.22095507010817528, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.990513414144516, "step": 3286 }, { "clip_ratio": 0.0, "completion_length": 329.4866256713867, "epoch": 0.9818534836830707, "grad_norm": 0.5374146103858948, "kl": 0.369384765625, "learning_rate": 1.957584735282847e-08, "loss": 0.198, "reward": 1.9743304550647736, "reward_std": 0.2620900683104992, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.9553571790456772, "rewards/tag_count_reward": 0.9720982611179352, "step": 3287 }, { "clip_ratio": 0.0, "completion_length": 293.2901954650879, "epoch": 0.9821521917705922, "grad_norm": 1.109889030456543, "kl": 0.36328125, "learning_rate": 1.8928961067610217e-08, "loss": 0.2486, "reward": 2.052455484867096, "reward_std": 0.2258329726755619, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9832589626312256, "step": 3288 }, { "clip_ratio": 0.0, "completion_length": 294.38616943359375, "epoch": 0.9824508998581136, "grad_norm": 0.6101237535476685, "kl": 0.275146484375, "learning_rate": 1.8292933209432816e-08, "loss": 0.1593, "reward": 2.0580358505249023, "reward_std": 0.18294898606836796, "rewards/accuracy_reward": 0.08482143329456449, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9888393133878708, "step": 3289 }, { "clip_ratio": 0.0, "completion_length": 292.1272430419922, "epoch": 0.9827496079456352, "grad_norm": 0.5329746007919312, "kl": 0.32275390625, "learning_rate": 1.7667764470230553e-08, "loss": 0.1246, "reward": 2.083147346973419, "reward_std": 0.24648922309279442, "rewards/accuracy_reward": 0.11607143096625805, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9871652126312256, "step": 3290 }, { "clip_ratio": 0.0, "completion_length": 309.6138458251953, "epoch": 0.9830483160331566, "grad_norm": 0.6476907134056091, "kl": 0.3406982421875, "learning_rate": 1.705345553012716e-08, "loss": 0.2574, "reward": 2.0569197237491608, "reward_std": 0.24519906006753445, "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9743303954601288, "step": 3291 }, { "clip_ratio": 0.0, "completion_length": 287.62278747558594, "epoch": 0.9833470241206781, "grad_norm": 0.5608704686164856, "kl": 0.22802734375, "learning_rate": 1.6450007057431382e-08, "loss": 0.0998, "reward": 2.063616156578064, "reward_std": 0.13009734638035297, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.987723246216774, "step": 3292 }, { "clip_ratio": 0.0, "completion_length": 283.8393020629883, "epoch": 0.9836457322081995, "grad_norm": 0.5113509893417358, "kl": 0.301513671875, "learning_rate": 1.5857419708633636e-08, "loss": 0.2113, "reward": 2.0820313096046448, "reward_std": 0.1753324456512928, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9838170111179352, "step": 3293 }, { "clip_ratio": 0.0, "completion_length": 302.46207427978516, "epoch": 0.983944440295721, "grad_norm": 0.36182108521461487, "kl": 0.308837890625, "learning_rate": 1.5275694128412675e-08, "loss": 0.1313, "reward": 2.092076003551483, "reward_std": 0.1764438096433878, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9893973618745804, "step": 3294 }, { "clip_ratio": 0.0, "completion_length": 302.95983123779297, "epoch": 0.9842431483832424, "grad_norm": 0.945742666721344, "kl": 0.460693359375, "learning_rate": 1.4704830949627825e-08, "loss": 0.2903, "reward": 2.023995667695999, "reward_std": 0.21841605007648468, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.9748884290456772, "step": 3295 }, { "clip_ratio": 0.0, "completion_length": 309.3616180419922, "epoch": 0.984541856470764, "grad_norm": 0.3182097375392914, "kl": 0.2447509765625, "learning_rate": 1.4144830793323406e-08, "loss": 0.121, "reward": 2.008370578289032, "reward_std": 0.13453782349824905, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.98604916036129, "step": 3296 }, { "clip_ratio": 0.0, "completion_length": 281.4643020629883, "epoch": 0.9848405645582854, "grad_norm": 0.6339004039764404, "kl": 0.35595703125, "learning_rate": 1.3595694268723202e-08, "loss": 0.1566, "reward": 2.0530134439468384, "reward_std": 0.1335152629762888, "rewards/accuracy_reward": 0.0758928582072258, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9905134290456772, "step": 3297 }, { "clip_ratio": 0.0, "completion_length": 274.0089454650879, "epoch": 0.9851392726458069, "grad_norm": 0.7553693056106567, "kl": 0.40234375, "learning_rate": 1.3057421973236007e-08, "loss": 0.1922, "reward": 2.0172992050647736, "reward_std": 0.178872580640018, "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.9776785969734192, "rewards/tag_count_reward": 0.9860491305589676, "step": 3298 }, { "clip_ratio": 0.0, "completion_length": 329.00447845458984, "epoch": 0.9854379807333283, "grad_norm": 0.5494033694267273, "kl": 0.360107421875, "learning_rate": 1.253001449244673e-08, "loss": 0.2982, "reward": 2.0167412161827087, "reward_std": 0.2706719972193241, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.9642857611179352, "rewards/tag_count_reward": 0.9765625298023224, "step": 3299 }, { "clip_ratio": 0.0, "completion_length": 274.5825958251953, "epoch": 0.9857366888208499, "grad_norm": 0.49314334988594055, "kl": 0.21630859375, "learning_rate": 1.2013472400125293e-08, "loss": 0.1064, "reward": 2.029017984867096, "reward_std": 0.15064759273082018, "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.9866071790456772, "rewards/tag_count_reward": 0.9933036118745804, "step": 3300 }, { "clip_ratio": 0.0, "completion_length": 297.12500762939453, "epoch": 0.9860353969083713, "grad_norm": 0.4976453483104706, "kl": 0.36181640625, "learning_rate": 1.150779625821885e-08, "loss": 0.1721, "reward": 2.111049234867096, "reward_std": 0.184514744207263, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9860491454601288, "step": 3301 }, { "clip_ratio": 0.0, "completion_length": 329.88170623779297, "epoch": 0.9863341049958928, "grad_norm": 0.40538251399993896, "kl": 0.311279296875, "learning_rate": 1.1012986616850685e-08, "loss": 0.1323, "reward": 1.9804687798023224, "reward_std": 0.12847187370061874, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9871652126312256, "step": 3302 }, { "clip_ratio": 0.0, "completion_length": 290.3951072692871, "epoch": 0.9866328130834142, "grad_norm": 0.38972556591033936, "kl": 0.26025390625, "learning_rate": 1.0529044014329081e-08, "loss": 0.1786, "reward": 2.0825893878936768, "reward_std": 0.17439180798828602, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.9888393133878708, "step": 3303 }, { "clip_ratio": 0.0, "completion_length": 295.22545623779297, "epoch": 0.9869315211709357, "grad_norm": 0.4905446767807007, "kl": 0.224609375, "learning_rate": 1.0055968977132902e-08, "loss": 0.0985, "reward": 2.1194196939468384, "reward_std": 0.12824185006320477, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.9888392984867096, "rewards/tag_count_reward": 0.9921875298023224, "step": 3304 }, { "clip_ratio": 0.0, "completion_length": 282.18750762939453, "epoch": 0.9872302292584572, "grad_norm": 0.25175270438194275, "kl": 0.23876953125, "learning_rate": 9.593762019922681e-09, "loss": 0.1543, "reward": 2.0200893580913544, "reward_std": 0.1277326475828886, "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.9821428805589676, "rewards/tag_count_reward": 0.9888393133878708, "step": 3305 }, { "clip_ratio": 0.0, "completion_length": 305.48439025878906, "epoch": 0.9875289373459787, "grad_norm": 2.5132029056549072, "kl": 0.656494140625, "learning_rate": 9.142423645535081e-09, "loss": 0.2956, "reward": 2.0546876192092896, "reward_std": 0.27880754321813583, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9765625447034836, "step": 3306 }, { "clip_ratio": 0.0, "completion_length": 284.94421005249023, "epoch": 0.9878276454335001, "grad_norm": 1.122606635093689, "kl": 0.490234375, "learning_rate": 8.701954344980668e-09, "loss": 0.2892, "reward": 2.06194207072258, "reward_std": 0.21494648233056068, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9838170111179352, "step": 3307 }, { "clip_ratio": 0.0, "completion_length": 291.3861770629883, "epoch": 0.9881263535210216, "grad_norm": 0.6501266956329346, "kl": 0.21337890625, "learning_rate": 8.272354597448351e-09, "loss": 0.1401, "reward": 2.05412957072258, "reward_std": 0.19514644145965576, "rewards/accuracy_reward": 0.08258928707800806, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.9893973618745804, "step": 3308 }, { "clip_ratio": 0.0, "completion_length": 277.38393783569336, "epoch": 0.988425061608543, "grad_norm": 0.5322305560112, "kl": 0.32568359375, "learning_rate": 7.853624870298727e-09, "loss": 0.2155, "reward": 2.102678596973419, "reward_std": 0.18380131758749485, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9843750298023224, "step": 3309 }, { "clip_ratio": 0.0, "completion_length": 282.1116180419922, "epoch": 0.9887237696960646, "grad_norm": 0.9031490087509155, "kl": 0.2900390625, "learning_rate": 7.4457656190707324e-09, "loss": 0.1741, "reward": 2.0256697237491608, "reward_std": 0.18666969239711761, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.9776786267757416, "rewards/tag_count_reward": 0.9832589626312256, "step": 3310 }, { "clip_ratio": 0.0, "completion_length": 302.85938262939453, "epoch": 0.989022477783586, "grad_norm": 1.0792219638824463, "kl": 0.438720703125, "learning_rate": 7.048777287472774e-09, "loss": 0.2017, "reward": 2.0630581080913544, "reward_std": 0.22930751368403435, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.982700914144516, "step": 3311 }, { "clip_ratio": 0.0, "completion_length": 310.9151916503906, "epoch": 0.9893211858711075, "grad_norm": 0.49620386958122253, "kl": 0.26416015625, "learning_rate": 6.6626603073916e-09, "loss": 0.1394, "reward": 2.03069207072258, "reward_std": 0.19603585824370384, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.9799107611179352, "rewards/tag_count_reward": 0.98604916036129, "step": 3312 }, { "clip_ratio": 0.0, "completion_length": 273.41295623779297, "epoch": 0.9896198939586289, "grad_norm": 0.8566257953643799, "kl": 0.226318359375, "learning_rate": 6.287415098883421e-09, "loss": 0.1515, "reward": 2.0546876192092896, "reward_std": 0.14809424336999655, "rewards/accuracy_reward": 0.08035714761354029, "rewards/format_reward": 0.9843750298023224, "rewards/tag_count_reward": 0.9899553954601288, "step": 3313 }, { "clip_ratio": 0.0, "completion_length": 281.8683204650879, "epoch": 0.9899186020461505, "grad_norm": 1.2132809162139893, "kl": 0.46142578125, "learning_rate": 5.923042070178353e-09, "loss": 0.2458, "reward": 2.0786831080913544, "reward_std": 0.2728617675602436, "rewards/accuracy_reward": 0.12500000977888703, "rewards/format_reward": 0.9732143431901932, "rewards/tag_count_reward": 0.9804687947034836, "step": 3314 }, { "clip_ratio": 0.0, "completion_length": 292.60269927978516, "epoch": 0.9902173101336719, "grad_norm": 0.7256569266319275, "kl": 0.47998046875, "learning_rate": 5.569541617679308e-09, "loss": 0.2174, "reward": 2.0217634737491608, "reward_std": 0.13743101991713047, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9882812798023224, "step": 3315 }, { "clip_ratio": 0.0, "completion_length": 293.8549156188965, "epoch": 0.9905160182211933, "grad_norm": 0.6661884188652039, "kl": 0.2802734375, "learning_rate": 5.22691412595866e-09, "loss": 0.1521, "reward": 2.075892925262451, "reward_std": 0.18753170780837536, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.9821428954601288, "rewards/tag_count_reward": 0.986607164144516, "step": 3316 }, { "clip_ratio": 0.0, "completion_length": 316.1116256713867, "epoch": 0.9908147263087148, "grad_norm": 1.527334213256836, "kl": 0.6298828125, "learning_rate": 4.895159967762686e-09, "loss": 0.3074, "reward": 2.0206474363803864, "reward_std": 0.26940372213721275, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.9665178954601288, "rewards/tag_count_reward": 0.9715402126312256, "step": 3317 }, { "clip_ratio": 0.0, "completion_length": 315.52457427978516, "epoch": 0.9911134343962362, "grad_norm": 0.49104318022727966, "kl": 0.30908203125, "learning_rate": 4.574279504007128e-09, "loss": 0.1997, "reward": 2.0412947237491608, "reward_std": 0.15555774793028831, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9832589775323868, "step": 3318 }, { "clip_ratio": 0.0, "completion_length": 270.3616142272949, "epoch": 0.9914121424837578, "grad_norm": 1.3155694007873535, "kl": 0.376953125, "learning_rate": 4.264273083778303e-09, "loss": 0.1761, "reward": 2.0775670409202576, "reward_std": 0.23192019015550613, "rewards/accuracy_reward": 0.1205357238650322, "rewards/format_reward": 0.975446492433548, "rewards/tag_count_reward": 0.9815848767757416, "step": 3319 }, { "clip_ratio": 0.0, "completion_length": 272.448673248291, "epoch": 0.9917108505712792, "grad_norm": 2.2566845417022705, "kl": 0.305908203125, "learning_rate": 3.965141044333099e-09, "loss": 0.1398, "reward": 2.099888503551483, "reward_std": 0.15845200419425964, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9882812947034836, "step": 3320 }, { "clip_ratio": 0.0, "completion_length": 280.1138458251953, "epoch": 0.9920095586588007, "grad_norm": 0.40875959396362305, "kl": 0.236572265625, "learning_rate": 3.676883711097867e-09, "loss": 0.1233, "reward": 2.1121652126312256, "reward_std": 0.15495355613529682, "rewards/accuracy_reward": 0.1272321455180645, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9938616454601288, "step": 3321 }, { "clip_ratio": 0.0, "completion_length": 284.59152603149414, "epoch": 0.9923082667463221, "grad_norm": 0.357875257730484, "kl": 0.271484375, "learning_rate": 3.3995013976684253e-09, "loss": 0.2481, "reward": 2.0731027722358704, "reward_std": 0.2322436086833477, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9793527275323868, "step": 3322 }, { "clip_ratio": 0.0, "completion_length": 274.83930587768555, "epoch": 0.9926069748338436, "grad_norm": 0.4951355755329132, "kl": 0.2607421875, "learning_rate": 3.132994405808942e-09, "loss": 0.1501, "reward": 2.1389509439468384, "reward_std": 0.2226889580488205, "rewards/accuracy_reward": 0.16964286286383867, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9871652275323868, "step": 3323 }, { "clip_ratio": 0.0, "completion_length": 324.0558166503906, "epoch": 0.992905682921365, "grad_norm": 0.6870294809341431, "kl": 0.351318359375, "learning_rate": 2.877363025454161e-09, "loss": 0.1721, "reward": 2.0117188692092896, "reward_std": 0.23412876203656197, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9760045111179352, "step": 3324 }, { "clip_ratio": 0.0, "completion_length": 288.69866943359375, "epoch": 0.9932043910088866, "grad_norm": 0.5042446851730347, "kl": 0.287109375, "learning_rate": 2.632607534703846e-09, "loss": 0.2425, "reward": 2.044642925262451, "reward_std": 0.2776401750743389, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9799107611179352, "step": 3325 }, { "clip_ratio": 0.0, "completion_length": 303.5892868041992, "epoch": 0.993503099096408, "grad_norm": 1.9193134307861328, "kl": 0.570068359375, "learning_rate": 2.3987281998294477e-09, "loss": 0.2887, "reward": 2.1127233505249023, "reward_std": 0.30061621218919754, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.9620536118745804, "rewards/tag_count_reward": 0.972098246216774, "step": 3326 }, { "clip_ratio": 0.0, "completion_length": 332.06251525878906, "epoch": 0.9938018071839295, "grad_norm": 0.366626501083374, "kl": 0.386962890625, "learning_rate": 2.1757252752685475e-09, "loss": 0.186, "reward": 2.107142925262451, "reward_std": 0.2499413900077343, "rewards/accuracy_reward": 0.14732143399305642, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9821428954601288, "step": 3327 }, { "clip_ratio": 0.0, "completion_length": 286.32143783569336, "epoch": 0.9941005152714509, "grad_norm": 0.7395191788673401, "kl": 0.48388671875, "learning_rate": 1.9635990036270813e-09, "loss": 0.1754, "reward": 2.0340402722358704, "reward_std": 0.17821764387190342, "rewards/accuracy_reward": 0.06250000046566129, "rewards/format_reward": 0.9843750447034836, "rewards/tag_count_reward": 0.9871652126312256, "step": 3328 }, { "clip_ratio": 0.0, "completion_length": 293.667423248291, "epoch": 0.9943992233589725, "grad_norm": 1.3139832019805908, "kl": 0.2568359375, "learning_rate": 1.7623496156771169e-09, "loss": 0.1704, "reward": 2.084821581840515, "reward_std": 0.16841925494372845, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9888393431901932, "step": 3329 }, { "clip_ratio": 0.0, "completion_length": 286.8571548461914, "epoch": 0.9946979314464939, "grad_norm": 1.0407545566558838, "kl": 0.26904296875, "learning_rate": 1.5719773303568553e-09, "loss": 0.1688, "reward": 2.0223214626312256, "reward_std": 0.1861633025109768, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9843750298023224, "step": 3330 }, { "clip_ratio": 0.0, "completion_length": 287.36162185668945, "epoch": 0.9949966395340154, "grad_norm": 0.45148688554763794, "kl": 0.4326171875, "learning_rate": 1.392482354775071e-09, "loss": 0.1523, "reward": 2.0228795409202576, "reward_std": 0.1863498780876398, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.9827009290456772, "step": 3331 }, { "clip_ratio": 0.0, "completion_length": 306.6495704650879, "epoch": 0.9952953476215368, "grad_norm": 1.0895121097564697, "kl": 0.4381103515625, "learning_rate": 1.2238648842033408e-09, "loss": 0.1501, "reward": 2.0848215222358704, "reward_std": 0.17510836571455002, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.9843750149011612, "rewards/tag_count_reward": 0.9888392984867096, "step": 3332 }, { "clip_ratio": 0.0, "completion_length": 303.5268020629883, "epoch": 0.9955940557090583, "grad_norm": 0.5405223369598389, "kl": 0.2529296875, "learning_rate": 1.0661251020815944e-09, "loss": 0.1523, "reward": 2.078125089406967, "reward_std": 0.15757093206048012, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9866071790456772, "step": 3333 }, { "clip_ratio": 0.0, "completion_length": 310.19420623779297, "epoch": 0.9958927637965798, "grad_norm": 0.7962281107902527, "kl": 0.42333984375, "learning_rate": 9.192631800147844e-10, "loss": 0.1078, "reward": 2.1328126192092896, "reward_std": 0.20447009429335594, "rewards/accuracy_reward": 0.17410714738070965, "rewards/format_reward": 0.9754464775323868, "rewards/tag_count_reward": 0.9832589775323868, "step": 3334 }, { "clip_ratio": 0.0, "completion_length": 295.9196548461914, "epoch": 0.9961914718841013, "grad_norm": 0.6753193140029907, "kl": 0.508544921875, "learning_rate": 7.832792777739962e-10, "loss": 0.2183, "reward": 2.0345983505249023, "reward_std": 0.18201371654868126, "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9832589626312256, "step": 3335 }, { "clip_ratio": 0.0, "completion_length": 304.92858123779297, "epoch": 0.9964901799716227, "grad_norm": 1.001061201095581, "kl": 0.258544921875, "learning_rate": 6.581735432964476e-10, "loss": 0.0676, "reward": 2.123326003551483, "reward_std": 0.18712153285741806, "rewards/accuracy_reward": 0.14732143376022577, "rewards/format_reward": 0.9866071939468384, "rewards/tag_count_reward": 0.9893973767757416, "step": 3336 }, { "clip_ratio": 0.0, "completion_length": 308.23663330078125, "epoch": 0.9967888880591442, "grad_norm": 0.9675818085670471, "kl": 0.3095703125, "learning_rate": 5.439461126854894e-10, "loss": 0.2287, "reward": 2.0390625596046448, "reward_std": 0.2695717252790928, "rewards/accuracy_reward": 0.10937500302679837, "rewards/format_reward": 0.9598214775323868, "rewards/tag_count_reward": 0.9698661118745804, "step": 3337 }, { "clip_ratio": 0.0, "completion_length": 323.89064025878906, "epoch": 0.9970875961466656, "grad_norm": 2.944906711578369, "kl": 0.489013671875, "learning_rate": 4.4059711020949523e-10, "loss": 0.2815, "reward": 2.076451003551483, "reward_std": 0.24895723909139633, "rewards/accuracy_reward": 0.12946428963914514, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9782366454601288, "step": 3338 }, { "clip_ratio": 0.0, "completion_length": 312.3058204650879, "epoch": 0.9973863042341872, "grad_norm": 6.201056480407715, "kl": 0.9521484375, "learning_rate": 3.4812664830186084e-10, "loss": 0.2185, "reward": 2.05412957072258, "reward_std": 0.1939438432455063, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.9687500447034836, "rewards/tag_count_reward": 0.9782366752624512, "step": 3339 }, { "clip_ratio": 0.0, "completion_length": 290.6785888671875, "epoch": 0.9976850123217086, "grad_norm": 1.1139740943908691, "kl": 0.380126953125, "learning_rate": 2.665348275610047e-10, "loss": 0.2563, "reward": 2.0809152126312256, "reward_std": 0.23957587778568268, "rewards/accuracy_reward": 0.11830358020961285, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.984933078289032, "step": 3340 }, { "clip_ratio": 0.0, "completion_length": 325.8772430419922, "epoch": 0.9979837204092301, "grad_norm": 1.3957599401474, "kl": 0.36669921875, "learning_rate": 1.958217367514781e-10, "loss": 0.176, "reward": 2.0429688096046448, "reward_std": 0.2488677129149437, "rewards/accuracy_reward": 0.08482143189758062, "rewards/format_reward": 0.9776786118745804, "rewards/tag_count_reward": 0.9804687798023224, "step": 3341 }, { "clip_ratio": 0.0, "completion_length": 286.1986846923828, "epoch": 0.9982824284967515, "grad_norm": 0.9572336673736572, "kl": 0.33056640625, "learning_rate": 1.359874528006344e-10, "loss": 0.0656, "reward": 2.189732253551483, "reward_std": 0.1831950880587101, "rewards/accuracy_reward": 0.2031250149011612, "rewards/format_reward": 0.9910714626312256, "rewards/tag_count_reward": 0.9955357313156128, "step": 3342 }, { "clip_ratio": 0.0, "completion_length": 288.75671005249023, "epoch": 0.998581136584273, "grad_norm": 0.6481905579566956, "kl": 0.39599609375, "learning_rate": 8.703204080418026e-11, "loss": 0.1173, "reward": 2.0937500596046448, "reward_std": 0.2181297093629837, "rewards/accuracy_reward": 0.12946428847499192, "rewards/format_reward": 0.979910746216774, "rewards/tag_count_reward": 0.9843750149011612, "step": 3343 }, { "clip_ratio": 0.0, "completion_length": 243.37277603149414, "epoch": 0.9988798446717945, "grad_norm": 0.8018609285354614, "kl": 0.21337890625, "learning_rate": 4.895555402062435e-11, "loss": 0.1232, "reward": 2.1088171005249023, "reward_std": 0.19409086555242538, "rewards/accuracy_reward": 0.14062500838190317, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.98604916036129, "step": 3344 }, { "clip_ratio": 0.0, "completion_length": 345.3281478881836, "epoch": 0.999178552759316, "grad_norm": 0.36247318983078003, "kl": 0.2705078125, "learning_rate": 2.1758033871277507e-11, "loss": 0.1798, "reward": 2.021763503551483, "reward_std": 0.16940201818943024, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.9754464626312256, "rewards/tag_count_reward": 0.983816996216774, "step": 3345 }, { "clip_ratio": 0.0, "completion_length": 320.23216247558594, "epoch": 0.9994772608468374, "grad_norm": 5.442612171173096, "kl": 0.619873046875, "learning_rate": 5.439509946914001e-12, "loss": 0.1839, "reward": 2.0758929550647736, "reward_std": 0.16257525235414505, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.98214291036129, "rewards/tag_count_reward": 0.9866071939468384, "step": 3346 }, { "clip_ratio": 0.0, "completion_length": 312.21287536621094, "epoch": 0.9997759689343589, "grad_norm": 0.8781343102455139, "kl": 0.383544921875, "learning_rate": 0.0, "loss": 0.1867, "reward": 2.059709906578064, "reward_std": 0.1953374296426773, "rewards/accuracy_reward": 0.10267857508733869, "rewards/format_reward": 0.973214328289032, "rewards/tag_count_reward": 0.9838170111179352, "step": 3347 }, { "epoch": 0.9997759689343589, "step": 3347, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 11.2788, "train_samples_per_second": 8310.56, "train_steps_per_second": 296.752 } ], "logging_steps": 1, "max_steps": 3347, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }