{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9976019184652278, "eval_steps": 500, "global_step": 208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 912.9860992431641, "epoch": 0.004796163069544364, "grad_norm": 0.12673589773378027, "kl": 0.0, "learning_rate": 4.7619047619047613e-08, "loss": 0.0232, "reward": 0.6874999850988388, "reward_std": 0.3423890396952629, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 883.1944580078125, "epoch": 0.009592326139088728, "grad_norm": 0.116076838638148, "kl": 0.0, "learning_rate": 9.523809523809523e-08, "loss": 0.0136, "reward": 0.6423611119389534, "reward_std": 0.3120992071926594, "rewards/accuracy_reward": 0.243055559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3993055522441864, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 851.4652862548828, "epoch": 0.014388489208633094, "grad_norm": 0.12982224030488893, "kl": 2.41696834564209e-05, "learning_rate": 1.4285714285714285e-07, "loss": 0.0211, "reward": 0.7118055671453476, "reward_std": 0.3277251161634922, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4409722238779068, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 877.0138854980469, "epoch": 0.019184652278177457, "grad_norm": 0.12194604446708072, "kl": 2.086162567138672e-05, "learning_rate": 1.9047619047619045e-07, "loss": 0.0285, "reward": 0.6909722238779068, "reward_std": 0.31570543721318245, "rewards/accuracy_reward": 0.2847222248092294, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40625, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 884.4236297607422, "epoch": 0.023980815347721823, "grad_norm": 0.13261607584021887, "kl": 3.56137752532959e-05, "learning_rate": 2.3809523809523806e-07, "loss": 0.0312, "reward": 0.626736119389534, "reward_std": 0.2724486030638218, "rewards/accuracy_reward": 0.23611111007630825, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250074505806, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 825.4166870117188, "epoch": 0.02877697841726619, "grad_norm": 0.12101637274777749, "kl": 2.5272369384765625e-05, "learning_rate": 2.857142857142857e-07, "loss": 0.022, "reward": 0.817708358168602, "reward_std": 0.3249164782464504, "rewards/accuracy_reward": 0.361111119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4565972313284874, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 898.826416015625, "epoch": 0.03357314148681055, "grad_norm": 0.13410726618357155, "kl": 3.2961368560791016e-05, "learning_rate": 3.333333333333333e-07, "loss": 0.017, "reward": 0.6961805522441864, "reward_std": 0.281472560018301, "rewards/accuracy_reward": 0.2916666641831398, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4045138880610466, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 868.3472290039062, "epoch": 0.03836930455635491, "grad_norm": 0.14428783414562169, "kl": 3.2767653465270996e-05, "learning_rate": 3.809523809523809e-07, "loss": 0.0364, "reward": 0.7777777761220932, "reward_std": 0.38908588513731956, "rewards/accuracy_reward": 0.34722222574055195, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.430555559694767, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 922.3125152587891, "epoch": 0.04316546762589928, "grad_norm": 0.14202941889032566, "kl": 3.263354301452637e-05, "learning_rate": 4.285714285714285e-07, "loss": 0.0361, "reward": 0.524305559694767, "reward_std": 0.23212899640202522, "rewards/accuracy_reward": 0.15972222574055195, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3645833432674408, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 914.1597290039062, "epoch": 0.047961630695443645, "grad_norm": 0.12987837517343923, "kl": 3.269314765930176e-05, "learning_rate": 4.761904761904761e-07, "loss": 0.0316, "reward": 0.5885416567325592, "reward_std": 0.2799038216471672, "rewards/accuracy_reward": 0.19444444822147489, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3940972238779068, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 935.5347137451172, "epoch": 0.05275779376498801, "grad_norm": 0.12123557233064179, "kl": 2.6702880859375e-05, "learning_rate": 5.238095238095238e-07, "loss": 0.0268, "reward": 0.579861119389534, "reward_std": 0.2995072081685066, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3715277761220932, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 902.8472290039062, "epoch": 0.05755395683453238, "grad_norm": 0.1263644550467175, "kl": 1.638941466808319e-05, "learning_rate": 5.714285714285714e-07, "loss": 0.0182, "reward": 0.5815972238779068, "reward_std": 0.26795749366283417, "rewards/accuracy_reward": 0.18055555410683155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.401041679084301, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 910.9583435058594, "epoch": 0.06235011990407674, "grad_norm": 0.13348819214381502, "kl": 3.2275915145874023e-05, "learning_rate": 6.19047619047619e-07, "loss": 0.033, "reward": 0.5833333507180214, "reward_std": 0.27443326637148857, "rewards/accuracy_reward": 0.18750000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3958333432674408, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 837.5764007568359, "epoch": 0.0671462829736211, "grad_norm": 0.12534297846344097, "kl": 2.5704503059387207e-05, "learning_rate": 6.666666666666666e-07, "loss": 0.0398, "reward": 0.6614583432674408, "reward_std": 0.25699039548635483, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4322916716337204, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 870.8958435058594, "epoch": 0.07194244604316546, "grad_norm": 0.1178768584334791, "kl": 1.9222497940063477e-05, "learning_rate": 7.142857142857143e-07, "loss": 0.0223, "reward": 0.6701388955116272, "reward_std": 0.25698356330394745, "rewards/accuracy_reward": 0.25694444589316845, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4131944477558136, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 855.7222137451172, "epoch": 0.07673860911270983, "grad_norm": 0.15574941119385063, "kl": 2.9087066650390625e-05, "learning_rate": 7.619047619047618e-07, "loss": 0.0337, "reward": 0.7760416865348816, "reward_std": 0.4277946427464485, "rewards/accuracy_reward": 0.3194444440305233, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4565972313284874, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 869.6944427490234, "epoch": 0.0815347721822542, "grad_norm": 0.1258833956177519, "kl": 3.331899642944336e-05, "learning_rate": 8.095238095238095e-07, "loss": 0.0129, "reward": 0.6701388955116272, "reward_std": 0.30363673344254494, "rewards/accuracy_reward": 0.2847222238779068, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3854166716337204, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 874.4097137451172, "epoch": 0.08633093525179857, "grad_norm": 0.1315962441453753, "kl": 2.0717590814456344e-05, "learning_rate": 8.57142857142857e-07, "loss": 0.0301, "reward": 0.6163194552063942, "reward_std": 0.2553598415106535, "rewards/accuracy_reward": 0.21527777705341578, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4010416716337204, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 815.0555572509766, "epoch": 0.09112709832134293, "grad_norm": 0.14298620074456062, "kl": 3.075599670410156e-05, "learning_rate": 9.047619047619047e-07, "loss": 0.0405, "reward": 0.763888880610466, "reward_std": 0.2983681969344616, "rewards/accuracy_reward": 0.3263888955116272, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375000074505806, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 862.4861145019531, "epoch": 0.09592326139088729, "grad_norm": 0.13958983684341053, "kl": 3.007054328918457e-05, "learning_rate": 9.523809523809522e-07, "loss": 0.0179, "reward": 0.6788194626569748, "reward_std": 0.2541828490793705, "rewards/accuracy_reward": 0.24305555783212185, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4357638880610466, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 891.1597290039062, "epoch": 0.10071942446043165, "grad_norm": 0.14460083396808562, "kl": 5.367398262023926e-05, "learning_rate": 1e-06, "loss": 0.0206, "reward": 0.6215277761220932, "reward_std": 0.2927175499498844, "rewards/accuracy_reward": 0.22222222574055195, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.399305559694767, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 859.0, "epoch": 0.10551558752997602, "grad_norm": 0.13266204171177207, "kl": 7.867813110351562e-05, "learning_rate": 9.999364977905849e-07, "loss": 0.0212, "reward": 0.7864583432674408, "reward_std": 0.329488068819046, "rewards/accuracy_reward": 0.3472222238779068, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4392361119389534, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 841.5069427490234, "epoch": 0.11031175059952038, "grad_norm": 0.1368070463424762, "kl": 0.00011658668518066406, "learning_rate": 9.99746009084698e-07, "loss": 0.0389, "reward": 0.7725694477558136, "reward_std": 0.3147674612700939, "rewards/accuracy_reward": 0.3402777761220932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.432291679084301, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 887.5139007568359, "epoch": 0.11510791366906475, "grad_norm": 0.1458860079179632, "kl": 0.00013267993927001953, "learning_rate": 9.994285876443557e-07, "loss": 0.0341, "reward": 0.626736119389534, "reward_std": 0.2819124907255173, "rewards/accuracy_reward": 0.2361111119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250074505806, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 873.8611145019531, "epoch": 0.11990407673860912, "grad_norm": 0.130318186151234, "kl": 0.00016427040100097656, "learning_rate": 9.989843230560593e-07, "loss": 0.0384, "reward": 0.6493055671453476, "reward_std": 0.28573132678866386, "rewards/accuracy_reward": 0.22222222574055195, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4270833358168602, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 820.6111145019531, "epoch": 0.12470023980815348, "grad_norm": 0.1353493867926934, "kl": 0.00028514862060546875, "learning_rate": 9.984133407055104e-07, "loss": 0.0057, "reward": 0.7204861342906952, "reward_std": 0.27508755773305893, "rewards/accuracy_reward": 0.2777777798473835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4427083358168602, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 903.1041717529297, "epoch": 0.12949640287769784, "grad_norm": 0.13147174706101974, "kl": 0.0002446174621582031, "learning_rate": 9.97715801742224e-07, "loss": 0.043, "reward": 0.6232638880610466, "reward_std": 0.2895628921687603, "rewards/accuracy_reward": 0.2361111156642437, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3871527835726738, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 870.1736145019531, "epoch": 0.1342925659472422, "grad_norm": 0.12627875749194337, "kl": 0.0002665519714355469, "learning_rate": 9.968919030340457e-07, "loss": 0.0277, "reward": 0.756944477558136, "reward_std": 0.3117631673812866, "rewards/accuracy_reward": 0.3194444477558136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 896.8680725097656, "epoch": 0.13908872901678657, "grad_norm": 0.13847019160477245, "kl": 0.00043702125549316406, "learning_rate": 9.959418771115903e-07, "loss": 0.0286, "reward": 0.5954861268401146, "reward_std": 0.268420971930027, "rewards/accuracy_reward": 0.180555559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.414930559694767, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 802.3194580078125, "epoch": 0.14388489208633093, "grad_norm": 0.14967482022344253, "kl": 0.0006508827209472656, "learning_rate": 9.948659921026139e-07, "loss": 0.0318, "reward": 0.8125000149011612, "reward_std": 0.4278785213828087, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4583333283662796, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 794.6875, "epoch": 0.1486810551558753, "grad_norm": 0.17944099601126384, "kl": 0.0007243156433105469, "learning_rate": 9.936645516563387e-07, "loss": 0.0583, "reward": 0.8663194477558136, "reward_std": 0.35557055473327637, "rewards/accuracy_reward": 0.4027777835726738, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4635416716337204, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 880.7361145019531, "epoch": 0.15347721822541965, "grad_norm": 0.12117489678150584, "kl": 0.0006771087646484375, "learning_rate": 9.923378948577558e-07, "loss": 0.0401, "reward": 0.6406250074505806, "reward_std": 0.26150013506412506, "rewards/accuracy_reward": 0.23611111473292112, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4045138880610466, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 834.5555725097656, "epoch": 0.15827338129496402, "grad_norm": 0.1537599572741959, "kl": 0.00096893310546875, "learning_rate": 9.908863961319219e-07, "loss": 0.0342, "reward": 0.861111119389534, "reward_std": 0.37204235792160034, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4652777835726738, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 850.9791870117188, "epoch": 0.1630695443645084, "grad_norm": 0.1605541412094071, "kl": 0.0012502670288085938, "learning_rate": 9.893104651382861e-07, "loss": 0.055, "reward": 0.8003472238779068, "reward_std": 0.3042585700750351, "rewards/accuracy_reward": 0.3819444514811039, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4184027835726738, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 851.0069580078125, "epoch": 0.16786570743405277, "grad_norm": 0.13513924603619376, "kl": 0.001659393310546875, "learning_rate": 9.876105466550707e-07, "loss": 0.0509, "reward": 0.7881944477558136, "reward_std": 0.3145363964140415, "rewards/accuracy_reward": 0.3680555671453476, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4201388880610466, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 878.7361297607422, "epoch": 0.17266187050359713, "grad_norm": 0.1542251680807794, "kl": 0.0016040802001953125, "learning_rate": 9.857871204537401e-07, "loss": 0.0544, "reward": 0.6944444477558136, "reward_std": 0.2541184388101101, "rewards/accuracy_reward": 0.305555559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3888888955116272, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 830.6805572509766, "epoch": 0.1774580335731415, "grad_norm": 0.13514283713713865, "kl": 0.0020580291748046875, "learning_rate": 9.838407011635942e-07, "loss": 0.0246, "reward": 0.8350694626569748, "reward_std": 0.29664015769958496, "rewards/accuracy_reward": 0.3958333283662796, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4392361119389534, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 896.7152862548828, "epoch": 0.18225419664268586, "grad_norm": 0.12350709906557038, "kl": 0.0016126632690429688, "learning_rate": 9.817718381265238e-07, "loss": 0.0437, "reward": 0.6128472238779068, "reward_std": 0.31635782122612, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3836805522441864, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 864.9444580078125, "epoch": 0.18705035971223022, "grad_norm": 0.12426066518400752, "kl": 0.0020294189453125, "learning_rate": 9.795811152419678e-07, "loss": 0.0301, "reward": 0.7291666716337204, "reward_std": 0.24946986511349678, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3958333358168602, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 800.3333282470703, "epoch": 0.19184652278177458, "grad_norm": 0.15357763591255594, "kl": 0.0022525787353515625, "learning_rate": 9.772691508021193e-07, "loss": 0.042, "reward": 0.8281250149011612, "reward_std": 0.310004822909832, "rewards/accuracy_reward": 0.3888888992369175, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4392361044883728, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 818.4653015136719, "epoch": 0.19664268585131894, "grad_norm": 0.14942346890583016, "kl": 0.0027675628662109375, "learning_rate": 9.748365973174227e-07, "loss": 0.0492, "reward": 0.8402778059244156, "reward_std": 0.292511161416769, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4652777835726738, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 809.9514007568359, "epoch": 0.2014388489208633, "grad_norm": 0.15897417290331764, "kl": 0.003749847412109375, "learning_rate": 9.722841413324149e-07, "loss": 0.0459, "reward": 0.8593750149011612, "reward_std": 0.3366158865392208, "rewards/accuracy_reward": 0.4027777761220932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4565972238779068, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 729.5972290039062, "epoch": 0.20623501199040767, "grad_norm": 0.176025608996368, "kl": 0.002838134765625, "learning_rate": 9.6961250323196e-07, "loss": 0.0243, "reward": 1.1562499850988388, "reward_std": 0.33967938274145126, "rewards/accuracy_reward": 0.6319444552063942, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5243055522441864, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 850.7222290039062, "epoch": 0.21103117505995203, "grad_norm": 0.11868611450690505, "kl": 0.003307342529296875, "learning_rate": 9.668224370379346e-07, "loss": 0.0277, "reward": 0.8246527910232544, "reward_std": 0.2909863740205765, "rewards/accuracy_reward": 0.4027777910232544, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4218750074505806, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 758.6111297607422, "epoch": 0.2158273381294964, "grad_norm": 0.15818031203002028, "kl": 0.003475189208984375, "learning_rate": 9.639147301964175e-07, "loss": 0.0287, "reward": 0.9652777910232544, "reward_std": 0.301775723695755, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4861111119389534, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 838.8055725097656, "epoch": 0.22062350119904076, "grad_norm": 0.16002912577528097, "kl": 0.004032135009765625, "learning_rate": 9.608902033554475e-07, "loss": 0.0429, "reward": 0.7517361342906952, "reward_std": 0.34651144593954086, "rewards/accuracy_reward": 0.305555559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.446180559694767, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 843.9861297607422, "epoch": 0.22541966426858512, "grad_norm": 0.1540903493857499, "kl": 0.003780364990234375, "learning_rate": 9.577497101334103e-07, "loss": 0.0317, "reward": 0.8159722238779068, "reward_std": 0.3663570396602154, "rewards/accuracy_reward": 0.3611111119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.454861119389534, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 865.0486145019531, "epoch": 0.2302158273381295, "grad_norm": 0.14872925893794725, "kl": 0.004302978515625, "learning_rate": 9.544941368781208e-07, "loss": 0.0514, "reward": 0.7083333432674408, "reward_std": 0.39198317378759384, "rewards/accuracy_reward": 0.2777777798473835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.430555559694767, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 808.4028015136719, "epoch": 0.23501199040767387, "grad_norm": 0.13499682062737456, "kl": 0.004261016845703125, "learning_rate": 9.51124402416666e-07, "loss": 0.0296, "reward": 0.8680555671453476, "reward_std": 0.265322033315897, "rewards/accuracy_reward": 0.4305555559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 853.8611145019531, "epoch": 0.23980815347721823, "grad_norm": 0.16386389384814085, "kl": 0.00469207763671875, "learning_rate": 9.476414577960834e-07, "loss": 0.0508, "reward": 0.7951389104127884, "reward_std": 0.33591291680932045, "rewards/accuracy_reward": 0.3611111119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4340277835726738, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 770.3402862548828, "epoch": 0.2446043165467626, "grad_norm": 0.17896254581630255, "kl": 0.0062255859375, "learning_rate": 9.440462860149451e-07, "loss": 0.0483, "reward": 0.8229166716337204, "reward_std": 0.36454326659440994, "rewards/accuracy_reward": 0.3749999962747097, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4479166716337204, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 839.0208435058594, "epoch": 0.24940047961630696, "grad_norm": 0.13648789584357146, "kl": 0.00505828857421875, "learning_rate": 9.403399017459234e-07, "loss": 0.0323, "reward": 0.8489583283662796, "reward_std": 0.2952596992254257, "rewards/accuracy_reward": 0.3958333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 804.8263854980469, "epoch": 0.2541966426858513, "grad_norm": 0.1523526638090565, "kl": 0.0061492919921875, "learning_rate": 9.365233510494185e-07, "loss": 0.0435, "reward": 0.892361119389534, "reward_std": 0.32454150170087814, "rewards/accuracy_reward": 0.444444440305233, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4479166716337204, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 798.3125, "epoch": 0.2589928057553957, "grad_norm": 0.18724044885869282, "kl": 0.0067901611328125, "learning_rate": 9.325977110783263e-07, "loss": 0.0222, "reward": 0.9270833432674408, "reward_std": 0.335986964404583, "rewards/accuracy_reward": 0.4444444552063942, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482638880610466, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 796.0347290039062, "epoch": 0.2637889688249401, "grad_norm": 0.15753263206218102, "kl": 0.00702667236328125, "learning_rate": 9.285640897740315e-07, "loss": 0.0554, "reward": 0.878472238779068, "reward_std": 0.3221370540559292, "rewards/accuracy_reward": 0.4097222238779068, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 807.6458435058594, "epoch": 0.2685851318944844, "grad_norm": 0.1525585503261879, "kl": 0.0070648193359375, "learning_rate": 9.244236255537107e-07, "loss": 0.0488, "reward": 0.8350694477558136, "reward_std": 0.25923068448901176, "rewards/accuracy_reward": 0.3680555522441864, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4670139029622078, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 814.4583435058594, "epoch": 0.2733812949640288, "grad_norm": 0.15792440768758637, "kl": 0.0050201416015625, "learning_rate": 9.20177486989035e-07, "loss": 0.0434, "reward": 0.878472238779068, "reward_std": 0.31662074103951454, "rewards/accuracy_reward": 0.4166666641831398, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.461805559694767, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 816.2708435058594, "epoch": 0.27817745803357313, "grad_norm": 0.16159815697797508, "kl": 0.0063934326171875, "learning_rate": 9.158268724763614e-07, "loss": 0.0424, "reward": 0.894097238779068, "reward_std": 0.3126923553645611, "rewards/accuracy_reward": 0.4236111119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470486119389534, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 876.4583435058594, "epoch": 0.2829736211031175, "grad_norm": 0.1270858164567956, "kl": 0.0079803466796875, "learning_rate": 9.113730098985075e-07, "loss": 0.0267, "reward": 0.78125, "reward_std": 0.2495138719677925, "rewards/accuracy_reward": 0.3611111119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4201388880610466, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 842.1111297607422, "epoch": 0.28776978417266186, "grad_norm": 0.14390666559593665, "kl": 0.0057373046875, "learning_rate": 9.068171562782021e-07, "loss": 0.0467, "reward": 0.8940972536802292, "reward_std": 0.3231881149113178, "rewards/accuracy_reward": 0.4305555559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463541679084301, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 816.5902862548828, "epoch": 0.29256594724220625, "grad_norm": 0.16728826541039396, "kl": 0.00667572021484375, "learning_rate": 9.021605974233152e-07, "loss": 0.0724, "reward": 0.989583358168602, "reward_std": 0.36507341638207436, "rewards/accuracy_reward": 0.4861111044883728, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5034722313284874, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 828.9861145019531, "epoch": 0.2973621103117506, "grad_norm": 0.15432237348385633, "kl": 0.00737762451171875, "learning_rate": 8.974046475639604e-07, "loss": 0.0447, "reward": 0.925347238779068, "reward_std": 0.3722820319235325, "rewards/accuracy_reward": 0.4513888880610466, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4739583432674408, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 810.2569427490234, "epoch": 0.302158273381295, "grad_norm": 0.1856747664961947, "kl": 0.00745391845703125, "learning_rate": 8.925506489815772e-07, "loss": 0.0687, "reward": 0.895833358168602, "reward_std": 0.29615509510040283, "rewards/accuracy_reward": 0.430555559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4652777835726738, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 861.1111297607422, "epoch": 0.3069544364508393, "grad_norm": 0.13202082976554858, "kl": 0.00617218017578125, "learning_rate": 8.875999716300968e-07, "loss": 0.0299, "reward": 0.8020833432674408, "reward_std": 0.3038054183125496, "rewards/accuracy_reward": 0.3819444440305233, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4201388955116272, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 858.9097290039062, "epoch": 0.3117505995203837, "grad_norm": 0.152190266317737, "kl": 0.00725555419921875, "learning_rate": 8.825540127492965e-07, "loss": 0.0571, "reward": 0.7847222238779068, "reward_std": 0.3564433120191097, "rewards/accuracy_reward": 0.3472222313284874, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 798.2152862548828, "epoch": 0.31654676258992803, "grad_norm": 0.16383126534952586, "kl": 0.00787353515625, "learning_rate": 8.774141964704546e-07, "loss": 0.0431, "reward": 0.8836805671453476, "reward_std": 0.29356446862220764, "rewards/accuracy_reward": 0.4027777835726738, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4809027761220932, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 809.1597290039062, "epoch": 0.3213429256594724, "grad_norm": 0.17427922859293266, "kl": 0.00984954833984375, "learning_rate": 8.721819734144135e-07, "loss": 0.0541, "reward": 0.9930555671453476, "reward_std": 0.36635252088308334, "rewards/accuracy_reward": 0.4583333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5347222313284874, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 738.5833435058594, "epoch": 0.3261390887290168, "grad_norm": 0.16012047020291365, "kl": 0.009185791015625, "learning_rate": 8.668588202821706e-07, "loss": 0.039, "reward": 1.0850694477558136, "reward_std": 0.23961883038282394, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5434027910232544, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 832.7847442626953, "epoch": 0.33093525179856115, "grad_norm": 0.15979089643431796, "kl": 0.0091400146484375, "learning_rate": 8.614462394381026e-07, "loss": 0.0613, "reward": 0.9340277761220932, "reward_std": 0.3319687321782112, "rewards/accuracy_reward": 0.4513888955116272, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4826388880610466, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 731.4444427490234, "epoch": 0.33573141486810554, "grad_norm": 0.2080530430054881, "kl": 0.01006317138671875, "learning_rate": 8.559457584859535e-07, "loss": 0.0441, "reward": 1.0954861342906952, "reward_std": 0.42393119633197784, "rewards/accuracy_reward": 0.5138888955116272, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5815972238779068, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 804.1528015136719, "epoch": 0.3405275779376499, "grad_norm": 0.16129162539469918, "kl": 0.008331298828125, "learning_rate": 8.503589298376931e-07, "loss": 0.0347, "reward": 0.9513888955116272, "reward_std": 0.37669622898101807, "rewards/accuracy_reward": 0.4583333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4930555671453476, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 851.0902862548828, "epoch": 0.34532374100719426, "grad_norm": 0.1421650821283864, "kl": 0.00882720947265625, "learning_rate": 8.446873302753783e-07, "loss": 0.0403, "reward": 0.892361119389534, "reward_std": 0.2742934599518776, "rewards/accuracy_reward": 0.423611119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4687500074505806, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 834.8611145019531, "epoch": 0.3501199040767386, "grad_norm": 0.14158911723554238, "kl": 0.00821685791015625, "learning_rate": 8.389325605061341e-07, "loss": 0.0319, "reward": 0.9305555820465088, "reward_std": 0.2332368977367878, "rewards/accuracy_reward": 0.4583333283662796, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4722222238779068, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 872.9097442626953, "epoch": 0.354916067146283, "grad_norm": 0.1521349105449586, "kl": 0.0106048583984375, "learning_rate": 8.330962447103829e-07, "loss": 0.0301, "reward": 0.8125000149011612, "reward_std": 0.35327186062932014, "rewards/accuracy_reward": 0.3402777835726738, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4722222164273262, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 727.3472290039062, "epoch": 0.3597122302158273, "grad_norm": 0.1648475576874547, "kl": 0.0107269287109375, "learning_rate": 8.271800300834486e-07, "loss": 0.0719, "reward": 1.1545138657093048, "reward_std": 0.3374630883336067, "rewards/accuracy_reward": 0.5763888955116272, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 754.5485992431641, "epoch": 0.3645083932853717, "grad_norm": 0.15678609737006508, "kl": 0.0106353759765625, "learning_rate": 8.211855863706654e-07, "loss": 0.0206, "reward": 1.1302083283662796, "reward_std": 0.3273175358772278, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5677083358168602, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 758.7639007568359, "epoch": 0.36930455635491605, "grad_norm": 0.14059085263342075, "kl": 0.011688232421875, "learning_rate": 8.151146053961217e-07, "loss": 0.0247, "reward": 1.038194477558136, "reward_std": 0.24932898953557014, "rewards/accuracy_reward": 0.5138888955116272, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5243055447936058, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 762.6527709960938, "epoch": 0.37410071942446044, "grad_norm": 0.16861004490817355, "kl": 0.011260986328125, "learning_rate": 8.089688005851745e-07, "loss": 0.0374, "reward": 1.09375, "reward_std": 0.362262312322855, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5520833432674408, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 836.3055725097656, "epoch": 0.37889688249400477, "grad_norm": 0.13259854891508993, "kl": 0.009613037109375, "learning_rate": 8.02749906480864e-07, "loss": 0.0224, "reward": 0.9062500149011612, "reward_std": 0.2787036634981632, "rewards/accuracy_reward": 0.4097222313284874, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4965277835726738, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 779.7361297607422, "epoch": 0.38369304556354916, "grad_norm": 0.16767324138234957, "kl": 0.0133819580078125, "learning_rate": 7.964596782543716e-07, "loss": 0.0572, "reward": 1.0520833432674408, "reward_std": 0.28914331272244453, "rewards/accuracy_reward": 0.5277777910232544, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.524305559694767, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 767.7708435058594, "epoch": 0.38848920863309355, "grad_norm": 0.16351300117480583, "kl": 0.01165771484375, "learning_rate": 7.900998912096527e-07, "loss": 0.0307, "reward": 0.9444444477558136, "reward_std": 0.3029831796884537, "rewards/accuracy_reward": 0.4166666567325592, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5277777835726738, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 780.3541717529297, "epoch": 0.3932853717026379, "grad_norm": 0.1546912166828873, "kl": 0.013702392578125, "learning_rate": 7.836723402823913e-07, "loss": 0.0369, "reward": 1.0538194626569748, "reward_std": 0.3243863359093666, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5538194477558136, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 808.9375, "epoch": 0.3980815347721823, "grad_norm": 0.14595312337300426, "kl": 0.012359619140625, "learning_rate": 7.771788395334094e-07, "loss": 0.0399, "reward": 0.918402761220932, "reward_std": 0.2620566040277481, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5017361119389534, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 767.1805419921875, "epoch": 0.4028776978417266, "grad_norm": 0.1382082777095037, "kl": 0.0124969482421875, "learning_rate": 7.706212216366819e-07, "loss": 0.0237, "reward": 0.9461805671453476, "reward_std": 0.2770383469760418, "rewards/accuracy_reward": 0.4444444514811039, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.501736119389534, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 773.5, "epoch": 0.407673860911271, "grad_norm": 0.17673710255712385, "kl": 0.0130767822265625, "learning_rate": 7.640013373620979e-07, "loss": 0.0526, "reward": 1.0694444626569748, "reward_std": 0.36386215686798096, "rewards/accuracy_reward": 0.548611119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5208333432674408, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 709.4583435058594, "epoch": 0.41247002398081534, "grad_norm": 0.2107698032340365, "kl": 0.0170440673828125, "learning_rate": 7.573210550531125e-07, "loss": 0.0816, "reward": 1.222222238779068, "reward_std": 0.4683116003870964, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6180555671453476, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 834.0694427490234, "epoch": 0.4172661870503597, "grad_norm": 0.13070064074785792, "kl": 0.012725830078125, "learning_rate": 7.505822600994423e-07, "loss": 0.0331, "reward": 0.8090277761220932, "reward_std": 0.2008717618882656, "rewards/accuracy_reward": 0.3541666641831398, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.454861119389534, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 741.8402862548828, "epoch": 0.42206235011990406, "grad_norm": 0.20183514921907542, "kl": 0.01544189453125, "learning_rate": 7.437868544049463e-07, "loss": 0.0421, "reward": 0.8489583432674408, "reward_std": 0.2780023626983166, "rewards/accuracy_reward": 0.3472222313284874, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.501736119389534, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 826.2777862548828, "epoch": 0.42685851318944845, "grad_norm": 0.15298818073279, "kl": 0.013885498046875, "learning_rate": 7.36936755850849e-07, "loss": 0.054, "reward": 0.8472222238779068, "reward_std": 0.22455434128642082, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4722222313284874, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 796.0555419921875, "epoch": 0.4316546762589928, "grad_norm": 0.18224911397155316, "kl": 0.0146942138671875, "learning_rate": 7.300338977544519e-07, "loss": 0.0238, "reward": 0.9600694477558136, "reward_std": 0.36052028089761734, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5225694477558136, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 816.0208587646484, "epoch": 0.4364508393285372, "grad_norm": 0.19516033120759366, "kl": 0.0160675048828125, "learning_rate": 7.230802283234904e-07, "loss": 0.0525, "reward": 0.989583358168602, "reward_std": 0.3490638807415962, "rewards/accuracy_reward": 0.472222238779068, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5173611044883728, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 801.7291870117188, "epoch": 0.4412470023980815, "grad_norm": 0.14966073132580007, "kl": 0.0144195556640625, "learning_rate": 7.160777101062865e-07, "loss": 0.0341, "reward": 1.0225694626569748, "reward_std": 0.337300319224596, "rewards/accuracy_reward": 0.4791666567325592, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5434027910232544, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 834.2222290039062, "epoch": 0.4460431654676259, "grad_norm": 0.13423539328804068, "kl": 0.0140380859375, "learning_rate": 7.090283194378542e-07, "loss": 0.0035, "reward": 0.921875, "reward_std": 0.259520523250103, "rewards/accuracy_reward": 0.423611119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4982638880610466, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 832.9444427490234, "epoch": 0.45083932853717024, "grad_norm": 0.16645556279200993, "kl": 0.013580322265625, "learning_rate": 7.019340458821159e-07, "loss": 0.0388, "reward": 0.9652777910232544, "reward_std": 0.29097262397408485, "rewards/accuracy_reward": 0.4652777761220932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000074505806, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 760.0764007568359, "epoch": 0.4556354916067146, "grad_norm": 0.15120637517973337, "kl": 0.01580810546875, "learning_rate": 6.947968916703826e-07, "loss": 0.0373, "reward": 1.015625, "reward_std": 0.2590954527258873, "rewards/accuracy_reward": 0.486111119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5295138880610466, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 765.4791564941406, "epoch": 0.460431654676259, "grad_norm": 0.16253895506587696, "kl": 0.0163421630859375, "learning_rate": 6.876188711362603e-07, "loss": 0.0583, "reward": 0.9513889104127884, "reward_std": 0.3330418989062309, "rewards/accuracy_reward": 0.4236111044883728, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5277777910232544, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 775.6736297607422, "epoch": 0.46522781774580335, "grad_norm": 0.20268144913502836, "kl": 0.0174560546875, "learning_rate": 6.80402010147141e-07, "loss": 0.0393, "reward": 1.017361119389534, "reward_std": 0.3649497255682945, "rewards/accuracy_reward": 0.4652777798473835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5520833358168602, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 766.9652709960938, "epoch": 0.47002398081534774, "grad_norm": 0.1728775404147991, "kl": 0.015045166015625, "learning_rate": 6.731483455324374e-07, "loss": 0.0282, "reward": 1.1076389104127884, "reward_std": 0.3713233917951584, "rewards/accuracy_reward": 0.5347222238779068, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5729166716337204, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 781.7222442626953, "epoch": 0.4748201438848921, "grad_norm": 0.1996452779202398, "kl": 0.01593017578125, "learning_rate": 6.658599245087241e-07, "loss": 0.0927, "reward": 1.1597222536802292, "reward_std": 0.3544151149690151, "rewards/accuracy_reward": 0.5833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5763888955116272, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 823.1527862548828, "epoch": 0.47961630695443647, "grad_norm": 0.14748523873535388, "kl": 0.0178680419921875, "learning_rate": 6.585388041019487e-07, "loss": 0.0367, "reward": 1.0312500149011612, "reward_std": 0.27015675604343414, "rewards/accuracy_reward": 0.4861111119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5451388880610466, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 772.1944427490234, "epoch": 0.4844124700239808, "grad_norm": 0.5160967541046672, "kl": 0.0177764892578125, "learning_rate": 6.511870505668725e-07, "loss": 0.0517, "reward": 1.0746527761220932, "reward_std": 0.32491182163357735, "rewards/accuracy_reward": 0.5138888880610466, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5607638955116272, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 813.8680572509766, "epoch": 0.4892086330935252, "grad_norm": 0.13567321274406932, "kl": 0.017913818359375, "learning_rate": 6.438067388039064e-07, "loss": 0.039, "reward": 1.0138888955116272, "reward_std": 0.2365904077887535, "rewards/accuracy_reward": 0.4722222313284874, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5416666641831398, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 792.1250152587891, "epoch": 0.4940047961630695, "grad_norm": 0.16686045898429722, "kl": 0.017578125, "learning_rate": 6.36399951773509e-07, "loss": 0.0349, "reward": 1.052083358168602, "reward_std": 0.3173811621963978, "rewards/accuracy_reward": 0.5138888880610466, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5381944552063942, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 792.9583435058594, "epoch": 0.4988009592326139, "grad_norm": 0.16673634023147957, "kl": 0.019927978515625, "learning_rate": 6.289687799083072e-07, "loss": 0.0385, "reward": 0.954861119389534, "reward_std": 0.34330564737319946, "rewards/accuracy_reward": 0.4444444477558136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.510416679084301, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 838.2152862548828, "epoch": 0.5035971223021583, "grad_norm": 0.19765705391364718, "kl": 0.018402099609375, "learning_rate": 6.2151532052311e-07, "loss": 0.0526, "reward": 0.9461805671453476, "reward_std": 0.380074605345726, "rewards/accuracy_reward": 0.4305555522441864, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5156250074505806, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 848.3958282470703, "epoch": 0.5083932853717026, "grad_norm": 0.16771910068496884, "kl": 0.018890380859375, "learning_rate": 6.140416772229784e-07, "loss": 0.0449, "reward": 0.8819444477558136, "reward_std": 0.36511222273111343, "rewards/accuracy_reward": 0.3819444552063942, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 763.1527862548828, "epoch": 0.513189448441247, "grad_norm": 0.1914924709345533, "kl": 0.022308349609375, "learning_rate": 6.065499593095208e-07, "loss": 0.0358, "reward": 1.0451388955116272, "reward_std": 0.3345286548137665, "rewards/accuracy_reward": 0.4722222238779068, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5729166716337204, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 795.0000152587891, "epoch": 0.5179856115107914, "grad_norm": 0.23442272993782337, "kl": 0.02197265625, "learning_rate": 5.990422811855812e-07, "loss": 0.0786, "reward": 0.9982638955116272, "reward_std": 0.38987091183662415, "rewards/accuracy_reward": 0.4375000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5607639029622078, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 801.8402862548828, "epoch": 0.5227817745803357, "grad_norm": 0.21980101064995708, "kl": 0.02423095703125, "learning_rate": 5.915207617584858e-07, "loss": 0.0335, "reward": 0.9427083432674408, "reward_std": 0.36763929575681686, "rewards/accuracy_reward": 0.4305555522441864, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5121527835726738, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 743.0069427490234, "epoch": 0.5275779376498801, "grad_norm": 0.20707736821413145, "kl": 0.025299072265625, "learning_rate": 5.839875238420205e-07, "loss": 0.0706, "reward": 1.1406249850988388, "reward_std": 0.29043491929769516, "rewards/accuracy_reward": 0.555555559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.585069440305233, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 804.6111145019531, "epoch": 0.5323741007194245, "grad_norm": 0.17161222800354356, "kl": 0.021148681640625, "learning_rate": 5.764446935573041e-07, "loss": 0.0426, "reward": 0.9861110895872116, "reward_std": 0.33307311683893204, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.548611119389534, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 778.125, "epoch": 0.5371702637889688, "grad_norm": 0.21031410356763766, "kl": 0.0242919921875, "learning_rate": 5.688943997327288e-07, "loss": 0.05, "reward": 0.9496527910232544, "reward_std": 0.2749031111598015, "rewards/accuracy_reward": 0.4027777761220932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5468750074505806, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 767.6458435058594, "epoch": 0.5419664268585132, "grad_norm": 0.282246872790042, "kl": 0.02569580078125, "learning_rate": 5.613387733031375e-07, "loss": 0.0988, "reward": 1.1336805522441864, "reward_std": 0.35546836256980896, "rewards/accuracy_reward": 0.5486111268401146, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5850694477558136, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 727.4097137451172, "epoch": 0.5467625899280576, "grad_norm": 0.2177323521611779, "kl": 0.0255126953125, "learning_rate": 5.53779946708405e-07, "loss": 0.047, "reward": 1.1145833283662796, "reward_std": 0.28548414260149, "rewards/accuracy_reward": 0.5277777798473835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.586805559694767, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 762.1875152587891, "epoch": 0.5515587529976019, "grad_norm": 0.2228573500540112, "kl": 0.023681640625, "learning_rate": 5.462200532915951e-07, "loss": 0.0526, "reward": 1.0694444626569748, "reward_std": 0.3976950142532587, "rewards/accuracy_reward": 0.4722222350537777, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5972222313284874, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 821.8263854980469, "epoch": 0.5563549160671463, "grad_norm": 0.22544332116422824, "kl": 0.02313232421875, "learning_rate": 5.386612266968625e-07, "loss": 0.0808, "reward": 0.9774305522441864, "reward_std": 0.3071533590555191, "rewards/accuracy_reward": 0.3958333283662796, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5815972238779068, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 732.5486145019531, "epoch": 0.5611510791366906, "grad_norm": 0.24884339296220886, "kl": 0.026092529296875, "learning_rate": 5.311056002672712e-07, "loss": 0.0805, "reward": 1.2187500298023224, "reward_std": 0.33359793573617935, "rewards/accuracy_reward": 0.5902777835726738, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6284722238779068, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 759.3264007568359, "epoch": 0.565947242206235, "grad_norm": 0.20316735987226078, "kl": 0.029693603515625, "learning_rate": 5.235553064426961e-07, "loss": 0.0398, "reward": 1.1354166865348816, "reward_std": 0.29686928167939186, "rewards/accuracy_reward": 0.506944440305233, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.628472238779068, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 727.5, "epoch": 0.5707434052757794, "grad_norm": 0.29440880209457704, "kl": 0.0303955078125, "learning_rate": 5.160124761579795e-07, "loss": 0.0572, "reward": 1.2673611044883728, "reward_std": 0.3973044380545616, "rewards/accuracy_reward": 0.6180555671453476, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6493055820465088, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 746.875, "epoch": 0.5755395683453237, "grad_norm": 0.2292130236322372, "kl": 0.033233642578125, "learning_rate": 5.084792382415141e-07, "loss": 0.0549, "reward": 1.0781250149011612, "reward_std": 0.3586086630821228, "rewards/accuracy_reward": 0.4791666641831398, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5989583432674408, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 846.0, "epoch": 0.580335731414868, "grad_norm": 0.2898186862467139, "kl": 0.031280517578125, "learning_rate": 5.009577188144188e-07, "loss": 0.092, "reward": 0.9079861044883728, "reward_std": 0.3724118545651436, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5538194552063942, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 858.1944580078125, "epoch": 0.5851318944844125, "grad_norm": 0.2766016571596048, "kl": 0.03631591796875, "learning_rate": 4.93450040690479e-07, "loss": 0.0593, "reward": 0.9548611044883728, "reward_std": 0.43608929216861725, "rewards/accuracy_reward": 0.4027777761220932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5520833283662796, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 788.2152862548828, "epoch": 0.5899280575539568, "grad_norm": 0.26705241094954724, "kl": 0.0374755859375, "learning_rate": 4.859583227770217e-07, "loss": 0.0683, "reward": 1.0520833879709244, "reward_std": 0.2720135301351547, "rewards/accuracy_reward": 0.4791666641831398, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5729166567325592, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 772.5069427490234, "epoch": 0.5947242206235012, "grad_norm": 0.2671582354543242, "kl": 0.035980224609375, "learning_rate": 4.784846794768901e-07, "loss": 0.0564, "reward": 1.1267361044883728, "reward_std": 0.3838435262441635, "rewards/accuracy_reward": 0.5277777761220932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5989583283662796, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 807.1597442626953, "epoch": 0.5995203836930456, "grad_norm": 0.28942117017345487, "kl": 0.03851318359375, "learning_rate": 4.7103122009169283e-07, "loss": 0.0337, "reward": 1.0086805671453476, "reward_std": 0.31841161847114563, "rewards/accuracy_reward": 0.4513888880610466, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5572916716337204, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 794.0694580078125, "epoch": 0.60431654676259, "grad_norm": 0.29337843160217614, "kl": 0.034027099609375, "learning_rate": 4.63600048226491e-07, "loss": 0.0638, "reward": 1.0694444626569748, "reward_std": 0.37363580614328384, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5902777910232544, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 682.5347290039062, "epoch": 0.6091127098321343, "grad_norm": 0.2980289320595657, "kl": 0.046630859375, "learning_rate": 4.5619326119609346e-07, "loss": 0.0582, "reward": 1.0850694626569748, "reward_std": 0.3542333133518696, "rewards/accuracy_reward": 0.4583333283662796, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.626736119389534, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 810.3055572509766, "epoch": 0.6139088729016786, "grad_norm": 0.4115921669385449, "kl": 0.039794921875, "learning_rate": 4.4881294943312756e-07, "loss": 0.1129, "reward": 0.9635416716337204, "reward_std": 0.4346095398068428, "rewards/accuracy_reward": 0.3958333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5677083283662796, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 796.3958282470703, "epoch": 0.6187050359712231, "grad_norm": 0.28690441517101206, "kl": 0.03887939453125, "learning_rate": 4.414611958980512e-07, "loss": 0.0596, "reward": 1.104166641831398, "reward_std": 0.32108214125037193, "rewards/accuracy_reward": 0.5138888955116272, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5902777910232544, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 704.2986145019531, "epoch": 0.6235011990407674, "grad_norm": 0.39366843219844283, "kl": 0.045166015625, "learning_rate": 4.3414007549127594e-07, "loss": 0.0469, "reward": 1.1545138955116272, "reward_std": 0.362628273665905, "rewards/accuracy_reward": 0.5208333283662796, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.633680559694767, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 792.5555572509766, "epoch": 0.6282973621103117, "grad_norm": 0.3049060746508426, "kl": 0.044677734375, "learning_rate": 4.268516544675628e-07, "loss": 0.0332, "reward": 1.0902777910232544, "reward_std": 0.381888784468174, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5902777761220932, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 760.2291717529297, "epoch": 0.6330935251798561, "grad_norm": 0.5199022261670415, "kl": 0.05535888671875, "learning_rate": 4.195979898528589e-07, "loss": 0.0576, "reward": 1.131944477558136, "reward_std": 0.45905186980962753, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6319444477558136, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 736.3541717529297, "epoch": 0.6378896882494005, "grad_norm": 0.4002608571694072, "kl": 0.0509033203125, "learning_rate": 4.1238112886373967e-07, "loss": 0.0692, "reward": 1.2430555820465088, "reward_std": 0.43104151636362076, "rewards/accuracy_reward": 0.5902777910232544, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6527777910232544, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 750.2986145019531, "epoch": 0.6426858513189448, "grad_norm": 0.47064244094639573, "kl": 0.05303955078125, "learning_rate": 4.0520310832961747e-07, "loss": 0.0578, "reward": 1.2552083730697632, "reward_std": 0.3141016773879528, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6510416716337204, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 700.3611297607422, "epoch": 0.6474820143884892, "grad_norm": 0.5155783092634934, "kl": 0.04730224609375, "learning_rate": 3.980659541178841e-07, "loss": 0.0851, "reward": 1.1597222089767456, "reward_std": 0.33885327726602554, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5972222238779068, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 733.5833435058594, "epoch": 0.6522781774580336, "grad_norm": 0.3556569979579297, "kl": 0.0576171875, "learning_rate": 3.909716805621458e-07, "loss": 0.0298, "reward": 1.1493055522441864, "reward_std": 0.3029083050787449, "rewards/accuracy_reward": 0.5208333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.628472238779068, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 707.9791717529297, "epoch": 0.657074340527578, "grad_norm": 0.529609284338842, "kl": 0.06280517578125, "learning_rate": 3.8392228989371357e-07, "loss": 0.1004, "reward": 1.0902777910232544, "reward_std": 0.3477436378598213, "rewards/accuracy_reward": 0.4722222313284874, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6180555522441864, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 735.5902938842773, "epoch": 0.6618705035971223, "grad_norm": 0.5906533852176611, "kl": 0.06390380859375, "learning_rate": 3.7691977167650947e-07, "loss": 0.0947, "reward": 1.2638888955116272, "reward_std": 0.37084779888391495, "rewards/accuracy_reward": 0.5972222313284874, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6666666865348816, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 787.9236145019531, "epoch": 0.6666666666666666, "grad_norm": 0.3518724040292374, "kl": 0.05731201171875, "learning_rate": 3.6996610224554815e-07, "loss": 0.035, "reward": 1.1510416865348816, "reward_std": 0.39138108491897583, "rewards/accuracy_reward": 0.5208333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6302083432674408, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 753.4236145019531, "epoch": 0.6714628297362111, "grad_norm": 0.5518379170803371, "kl": 0.0836181640625, "learning_rate": 3.630632441491511e-07, "loss": 0.0206, "reward": 1.1197917014360428, "reward_std": 0.33788175135850906, "rewards/accuracy_reward": 0.4930555745959282, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6267361044883728, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 847.3541717529297, "epoch": 0.6762589928057554, "grad_norm": 0.7174313824432543, "kl": 0.08111572265625, "learning_rate": 3.562131455950538e-07, "loss": 0.075, "reward": 0.940972238779068, "reward_std": 0.39178355410695076, "rewards/accuracy_reward": 0.3819444477558136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5590277835726738, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 823.6180572509766, "epoch": 0.6810551558752997, "grad_norm": 0.8192916163585466, "kl": 0.09405517578125, "learning_rate": 3.4941773990055777e-07, "loss": 0.0704, "reward": 0.8750000149011612, "reward_std": 0.40220723301172256, "rewards/accuracy_reward": 0.3402777761220932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.534722238779068, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 741.7569580078125, "epoch": 0.6858513189448441, "grad_norm": 0.6199201515779919, "kl": 0.0787353515625, "learning_rate": 3.426789449468873e-07, "loss": 0.0473, "reward": 1.1718749850988388, "reward_std": 0.3498489521443844, "rewards/accuracy_reward": 0.5486111119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6232638955116272, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 832.3055572509766, "epoch": 0.6906474820143885, "grad_norm": 0.9765675758790994, "kl": 0.08935546875, "learning_rate": 3.359986626379022e-07, "loss": 0.0842, "reward": 0.984375, "reward_std": 0.48340315371751785, "rewards/accuracy_reward": 0.423611119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5607639029622078, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 741.2639007568359, "epoch": 0.6954436450839329, "grad_norm": 0.6256951971346841, "kl": 0.0902099609375, "learning_rate": 3.293787783633182e-07, "loss": 0.0524, "reward": 1.092013880610466, "reward_std": 0.35471441224217415, "rewards/accuracy_reward": 0.4861111268401146, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6059027761220932, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 749.1250152587891, "epoch": 0.7002398081534772, "grad_norm": 0.4746047065439084, "kl": 0.0960693359375, "learning_rate": 3.2282116046659064e-07, "loss": 0.0216, "reward": 1.1197916567325592, "reward_std": 0.3484783172607422, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6197916716337204, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 728.75, "epoch": 0.7050359712230215, "grad_norm": 0.4212278612821504, "kl": 0.1041259765625, "learning_rate": 3.163276597176087e-07, "loss": 0.0352, "reward": 1.3003472089767456, "reward_std": 0.366548266261816, "rewards/accuracy_reward": 0.6388888955116272, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.661458358168602, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 729.4166717529297, "epoch": 0.709832134292566, "grad_norm": 0.5703953599283403, "kl": 0.1136474609375, "learning_rate": 3.099001087903473e-07, "loss": 0.0144, "reward": 1.1718750298023224, "reward_std": 0.44783008843660355, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 807.3610992431641, "epoch": 0.7146282973621103, "grad_norm": 0.6651812784116454, "kl": 0.12939453125, "learning_rate": 3.0354032174562863e-07, "loss": 0.0654, "reward": 1.0920139104127884, "reward_std": 0.3492956757545471, "rewards/accuracy_reward": 0.4791666641831398, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6128472313284874, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 786.3194732666016, "epoch": 0.7194244604316546, "grad_norm": 0.689721941763239, "kl": 0.1400146484375, "learning_rate": 2.97250093519136e-07, "loss": 0.0635, "reward": 1.1111111342906952, "reward_std": 0.3203607201576233, "rewards/accuracy_reward": 0.486111119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000149011612, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 792.2152709960938, "epoch": 0.7242206235011991, "grad_norm": 1.2387475983297178, "kl": 0.1351318359375, "learning_rate": 2.910311994148255e-07, "loss": 0.071, "reward": 1.09375, "reward_std": 0.36406850814819336, "rewards/accuracy_reward": 0.4930555559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6006944477558136, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 767.8333435058594, "epoch": 0.7290167865707434, "grad_norm": 0.7047137897175717, "kl": 0.1422119140625, "learning_rate": 2.848853946038782e-07, "loss": 0.0384, "reward": 1.0711805522441864, "reward_std": 0.2421913631260395, "rewards/accuracy_reward": 0.4791666641831398, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5920139104127884, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 749.2708435058594, "epoch": 0.7338129496402878, "grad_norm": 1.144909232594251, "kl": 0.1258544921875, "learning_rate": 2.7881441362933464e-07, "loss": 0.066, "reward": 1.045138880610466, "reward_std": 0.3445451110601425, "rewards/accuracy_reward": 0.430555559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6145833432674408, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 723.8055572509766, "epoch": 0.7386091127098321, "grad_norm": 0.9998994389867698, "kl": 0.1502685546875, "learning_rate": 2.7281996991655145e-07, "loss": 0.0722, "reward": 1.1649305671453476, "reward_std": 0.4142308458685875, "rewards/accuracy_reward": 0.5555555671453476, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 712.1736297607422, "epoch": 0.7434052757793765, "grad_norm": 0.8719467247990521, "kl": 0.1572265625, "learning_rate": 2.669037552896172e-07, "loss": 0.0362, "reward": 1.1753472536802292, "reward_std": 0.4305378869175911, "rewards/accuracy_reward": 0.5277777835726738, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6475694477558136, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 792.1458282470703, "epoch": 0.7482014388489209, "grad_norm": 1.5075628106035353, "kl": 0.19970703125, "learning_rate": 2.6106743949386586e-07, "loss": 0.0657, "reward": 1.0902777910232544, "reward_std": 0.39609089493751526, "rewards/accuracy_reward": 0.4652777910232544, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000149011612, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 720.9791717529297, "epoch": 0.7529976019184652, "grad_norm": 1.1801204435963506, "kl": 0.14697265625, "learning_rate": 2.553126697246217e-07, "loss": 0.0499, "reward": 1.1493055820465088, "reward_std": 0.40563249588012695, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6076388955116272, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 708.5486297607422, "epoch": 0.7577937649880095, "grad_norm": 1.3449351919779642, "kl": 0.1728515625, "learning_rate": 2.49641070162307e-07, "loss": 0.0543, "reward": 1.2118056118488312, "reward_std": 0.3700602427124977, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6284722238779068, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 757.7916870117188, "epoch": 0.762589928057554, "grad_norm": 1.141025370807687, "kl": 0.19677734375, "learning_rate": 2.440542415140466e-07, "loss": 0.0881, "reward": 1.1232638955116272, "reward_std": 0.4029542878270149, "rewards/accuracy_reward": 0.493055559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6302083432674408, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 677.4652862548828, "epoch": 0.7673860911270983, "grad_norm": 1.6300168405182138, "kl": 0.2421875, "learning_rate": 2.3855376056189737e-07, "loss": 0.058, "reward": 1.3194444477558136, "reward_std": 0.44138168543577194, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7152777761220932, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 664.5694580078125, "epoch": 0.7721822541966427, "grad_norm": 1.5733647270439453, "kl": 0.20703125, "learning_rate": 2.3314117971782945e-07, "loss": 0.0652, "reward": 1.1788194477558136, "reward_std": 0.3714512586593628, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6371527910232544, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 769.9722290039062, "epoch": 0.7769784172661871, "grad_norm": 1.533175643219152, "kl": 0.23095703125, "learning_rate": 2.2781802658558635e-07, "loss": 0.0533, "reward": 0.984375, "reward_std": 0.39164651185274124, "rewards/accuracy_reward": 0.4027777835726738, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.581597238779068, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 733.8125, "epoch": 0.7817745803357314, "grad_norm": 0.9142708944559201, "kl": 0.22607421875, "learning_rate": 2.2258580352954552e-07, "loss": 0.0356, "reward": 1.1076388955116272, "reward_std": 0.32901762425899506, "rewards/accuracy_reward": 0.472222238779068, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6354166716337204, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 743.4930572509766, "epoch": 0.7865707434052758, "grad_norm": 1.564329334465899, "kl": 0.3662109375, "learning_rate": 2.1744598725070347e-07, "loss": 0.0512, "reward": 1.0538194477558136, "reward_std": 0.28839075565338135, "rewards/accuracy_reward": 0.4652777835726738, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5885416716337204, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 739.3611145019531, "epoch": 0.7913669064748201, "grad_norm": 1.1882320571551739, "kl": 0.28759765625, "learning_rate": 2.1240002836990328e-07, "loss": 0.0243, "reward": 1.1527777910232544, "reward_std": 0.3735358491539955, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6736111044883728, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 706.4166717529297, "epoch": 0.7961630695443646, "grad_norm": 1.201792642717349, "kl": 0.32080078125, "learning_rate": 2.0744935101842275e-07, "loss": 0.0349, "reward": 1.1701389253139496, "reward_std": 0.3558007851243019, "rewards/accuracy_reward": 0.4930555671453476, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6770833283662796, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 669.9444580078125, "epoch": 0.8009592326139089, "grad_norm": 1.415131555543672, "kl": 0.339111328125, "learning_rate": 2.025953524360396e-07, "loss": 0.0588, "reward": 1.2465277910232544, "reward_std": 0.3056763559579849, "rewards/accuracy_reward": 0.6250000074505806, "rewards/format_reward": 0.0069444444961845875, "rewards/tag_count_reward": 0.6145833358168602, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 690.5277862548828, "epoch": 0.8057553956834532, "grad_norm": 1.8031709952858856, "kl": 0.342041015625, "learning_rate": 1.9783940257668473e-07, "loss": 0.1108, "reward": 1.1909722089767456, "reward_std": 0.42986829578876495, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6493055671453476, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 731.9027862548828, "epoch": 0.8105515587529976, "grad_norm": 1.54016513318198, "kl": 0.34375, "learning_rate": 1.9318284372179783e-07, "loss": 0.0829, "reward": 1.0902778059244156, "reward_std": 0.3709410950541496, "rewards/accuracy_reward": 0.4791666641831398, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.611111119389534, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 678.4305572509766, "epoch": 0.815347721822542, "grad_norm": 1.2889679479137333, "kl": 0.310791015625, "learning_rate": 1.8862699010149265e-07, "loss": 0.0637, "reward": 1.1493055820465088, "reward_std": 0.4024455025792122, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6493055671453476, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 666.5764007568359, "epoch": 0.8201438848920863, "grad_norm": 1.2493403412894837, "kl": 0.37060546875, "learning_rate": 1.8417312752363842e-07, "loss": 0.0292, "reward": 1.2986111342906952, "reward_std": 0.39357686042785645, "rewards/accuracy_reward": 0.5972222238779068, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7013888955116272, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 680.5, "epoch": 0.8249400479616307, "grad_norm": 1.3917142124264024, "kl": 0.289794921875, "learning_rate": 1.7982251301096496e-07, "loss": 0.0587, "reward": 1.2343749701976776, "reward_std": 0.3718552738428116, "rewards/accuracy_reward": 0.569444440305233, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6649305671453476, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 762.5139007568359, "epoch": 0.829736211031175, "grad_norm": 2.082423982787284, "kl": 0.37939453125, "learning_rate": 1.7557637444628934e-07, "loss": 0.0734, "reward": 1.0295139104127884, "reward_std": 0.42010512948036194, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.612847238779068, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 758.2777862548828, "epoch": 0.8345323741007195, "grad_norm": 1.4700469055914496, "kl": 0.31884765625, "learning_rate": 1.7143591022596842e-07, "loss": 0.0462, "reward": 1.0850694626569748, "reward_std": 0.3826001510024071, "rewards/accuracy_reward": 0.4513888955116272, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6336805373430252, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 694.0694427490234, "epoch": 0.8393285371702638, "grad_norm": 1.3297784898512763, "kl": 0.38623046875, "learning_rate": 1.674022889216737e-07, "loss": 0.0566, "reward": 1.2083333134651184, "reward_std": 0.36128322780132294, "rewards/accuracy_reward": 0.5486111044883728, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.659722238779068, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 678.7152862548828, "epoch": 0.8441247002398081, "grad_norm": 1.4970986391973622, "kl": 0.312744140625, "learning_rate": 1.634766489505815e-07, "loss": 0.0584, "reward": 1.2951389253139496, "reward_std": 0.39571166411042213, "rewards/accuracy_reward": 0.6180555522441864, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6770833283662796, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 650.5069580078125, "epoch": 0.8489208633093526, "grad_norm": 1.3198452732539876, "kl": 0.282958984375, "learning_rate": 1.5966009825407664e-07, "loss": 0.0487, "reward": 1.2291666865348816, "reward_std": 0.41252629458904266, "rewards/accuracy_reward": 0.5763888955116272, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6527777910232544, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 709.4861145019531, "epoch": 0.8537170263788969, "grad_norm": 1.571938422464948, "kl": 0.275146484375, "learning_rate": 1.5595371398505497e-07, "loss": 0.0601, "reward": 1.1354167014360428, "reward_std": 0.3936513438820839, "rewards/accuracy_reward": 0.4444444552063942, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6909722238779068, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 644.2847290039062, "epoch": 0.8585131894484412, "grad_norm": 1.2655481838867129, "kl": 0.313232421875, "learning_rate": 1.523585422039165e-07, "loss": 0.0395, "reward": 1.2447916567325592, "reward_std": 0.3132231794297695, "rewards/accuracy_reward": 0.5555555671453476, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.689236119389534, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 727.8055725097656, "epoch": 0.8633093525179856, "grad_norm": 2.287249210625076, "kl": 0.4541015625, "learning_rate": 1.4887559758333408e-07, "loss": 0.0809, "reward": 1.1718749850988388, "reward_std": 0.4368325099349022, "rewards/accuracy_reward": 0.5069444477558136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6649305671453476, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 677.8541641235352, "epoch": 0.86810551558753, "grad_norm": 2.029066779031394, "kl": 0.44677734375, "learning_rate": 1.4550586312187919e-07, "loss": 0.0318, "reward": 1.2395833432674408, "reward_std": 0.34451349824666977, "rewards/accuracy_reward": 0.597222238779068, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.642361119389534, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 668.8194427490234, "epoch": 0.8729016786570744, "grad_norm": 1.6348530863651882, "kl": 0.3857421875, "learning_rate": 1.4225028986658965e-07, "loss": 0.0692, "reward": 1.2500000298023224, "reward_std": 0.4199504852294922, "rewards/accuracy_reward": 0.5763888880610466, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.673611119389534, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 739.4930725097656, "epoch": 0.8776978417266187, "grad_norm": 2.171960155817612, "kl": 0.31787109375, "learning_rate": 1.391097966445526e-07, "loss": 0.0609, "reward": 1.1805555820465088, "reward_std": 0.3399686738848686, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.659722238779068, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 710.2638854980469, "epoch": 0.882494004796163, "grad_norm": 2.354582366359387, "kl": 0.439208984375, "learning_rate": 1.3608526980358242e-07, "loss": 0.1236, "reward": 1.1701389104127884, "reward_std": 0.3848187327384949, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.670138880610466, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 740.0833435058594, "epoch": 0.8872901678657075, "grad_norm": 1.0825401183719674, "kl": 0.34619140625, "learning_rate": 1.331775629620653e-07, "loss": 0.0486, "reward": 1.1493055522441864, "reward_std": 0.36640702188014984, "rewards/accuracy_reward": 0.4861111268401146, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6631944477558136, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 628.0208282470703, "epoch": 0.8920863309352518, "grad_norm": 1.7907760466473297, "kl": 0.5361328125, "learning_rate": 1.303874967680399e-07, "loss": 0.0542, "reward": 1.2604166567325592, "reward_std": 0.4223191514611244, "rewards/accuracy_reward": 0.5694444552063942, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.690972238779068, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 629.2708358764648, "epoch": 0.8968824940047961, "grad_norm": 1.7941484108862593, "kl": 0.375, "learning_rate": 1.277158586675852e-07, "loss": 0.0782, "reward": 1.1996527910232544, "reward_std": 0.33358532190322876, "rewards/accuracy_reward": 0.493055559694767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.706597238779068, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 774.6180725097656, "epoch": 0.9016786570743405, "grad_norm": 1.3540196149379808, "kl": 0.42333984375, "learning_rate": 1.2516340268257737e-07, "loss": 0.0613, "reward": 1.065972238779068, "reward_std": 0.3640429899096489, "rewards/accuracy_reward": 0.3819444477558136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6840277910232544, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 673.5208511352539, "epoch": 0.9064748201438849, "grad_norm": 2.3552439559585534, "kl": 0.56640625, "learning_rate": 1.2273084919788063e-07, "loss": 0.0419, "reward": 1.2378471940755844, "reward_std": 0.40937893092632294, "rewards/accuracy_reward": 0.5833333283662796, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.654513880610466, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 680.7222137451172, "epoch": 0.9112709832134293, "grad_norm": 2.0506106800441746, "kl": 0.62939453125, "learning_rate": 1.2041888475803217e-07, "loss": 0.0987, "reward": 1.1649305671453476, "reward_std": 0.4104561358690262, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6440972238779068, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 652.5694580078125, "epoch": 0.9160671462829736, "grad_norm": 2.011461189500051, "kl": 0.650390625, "learning_rate": 1.1822816187347622e-07, "loss": 0.1134, "reward": 1.1857638955116272, "reward_std": 0.4204775467514992, "rewards/accuracy_reward": 0.5347222313284874, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6510416716337204, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 672.8889007568359, "epoch": 0.920863309352518, "grad_norm": 1.766556315204042, "kl": 0.52197265625, "learning_rate": 1.1615929883640567e-07, "loss": 0.0868, "reward": 1.2239583283662796, "reward_std": 0.37772539258003235, "rewards/accuracy_reward": 0.5486111268401146, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.675347238779068, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 705.6805572509766, "epoch": 0.9256594724220624, "grad_norm": 1.2464963403957747, "kl": 0.42431640625, "learning_rate": 1.1421287954625985e-07, "loss": 0.0538, "reward": 1.2118055522441864, "reward_std": 0.3169648088514805, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6701388955116272, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 735.0416870117188, "epoch": 0.9304556354916067, "grad_norm": 1.4114342882843525, "kl": 0.4072265625, "learning_rate": 1.1238945334492928e-07, "loss": 0.038, "reward": 1.1388888955116272, "reward_std": 0.3398313596844673, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.659722238779068, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 734.25, "epoch": 0.935251798561151, "grad_norm": 1.3534299207814684, "kl": 0.740234375, "learning_rate": 1.1068953486171385e-07, "loss": 0.0948, "reward": 1.14930559694767, "reward_std": 0.4659058451652527, "rewards/accuracy_reward": 0.5138888955116272, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6354166716337204, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 767.3125, "epoch": 0.9400479616306955, "grad_norm": 1.8926261098268726, "kl": 0.6171875, "learning_rate": 1.0911360386807814e-07, "loss": 0.0999, "reward": 1.034722238779068, "reward_std": 0.3850885070860386, "rewards/accuracy_reward": 0.423611119389534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6111111342906952, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 671.8333282470703, "epoch": 0.9448441247002398, "grad_norm": 2.2335073755128354, "kl": 0.63671875, "learning_rate": 1.0766210514224419e-07, "loss": 0.0608, "reward": 1.206597238779068, "reward_std": 0.39280908554792404, "rewards/accuracy_reward": 0.5486111342906952, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.657986119389534, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 690.3472137451172, "epoch": 0.9496402877697842, "grad_norm": 2.0084883063292747, "kl": 0.505859375, "learning_rate": 1.0633544834366123e-07, "loss": 0.1037, "reward": 1.2291666865348816, "reward_std": 0.44404156506061554, "rewards/accuracy_reward": 0.5277777835726738, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7013888955116272, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 706.125, "epoch": 0.9544364508393285, "grad_norm": 2.8291221690849255, "kl": 0.67822265625, "learning_rate": 1.051340078973863e-07, "loss": 0.084, "reward": 1.0954861044883728, "reward_std": 0.43709662556648254, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6579861044883728, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 670.6319427490234, "epoch": 0.9592326139088729, "grad_norm": 1.3994189437001958, "kl": 0.4013671875, "learning_rate": 1.0405812288840964e-07, "loss": 0.0765, "reward": 1.2777777761220932, "reward_std": 0.34174390137195587, "rewards/accuracy_reward": 0.5763889029622078, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7013888955116272, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 629.4861145019531, "epoch": 0.9640287769784173, "grad_norm": 1.434366201343452, "kl": 0.3046875, "learning_rate": 1.031080969659543e-07, "loss": 0.0855, "reward": 1.3107638955116272, "reward_std": 0.34832194447517395, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7065972238779068, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 665.0833587646484, "epoch": 0.9688249400479616, "grad_norm": 1.5034799191037997, "kl": 0.46337890625, "learning_rate": 1.0228419825777602e-07, "loss": 0.0582, "reward": 1.2343750298023224, "reward_std": 0.4124446362257004, "rewards/accuracy_reward": 0.5763889029622078, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6579861342906952, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 781.5555572509766, "epoch": 0.973621103117506, "grad_norm": 2.678396312950966, "kl": 0.5263671875, "learning_rate": 1.0158665929448951e-07, "loss": 0.0947, "reward": 1.0694444477558136, "reward_std": 0.42056479305028915, "rewards/accuracy_reward": 0.3958333283662796, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6736111044883728, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 618.8888854980469, "epoch": 0.9784172661870504, "grad_norm": 2.12368752310343, "kl": 0.5556640625, "learning_rate": 1.0101567694394071e-07, "loss": 0.1194, "reward": 1.3229166567325592, "reward_std": 0.41604190319776535, "rewards/accuracy_reward": 0.6111111268401146, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7118055522441864, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 715.125, "epoch": 0.9832134292565947, "grad_norm": 1.6931563983674274, "kl": 0.5595703125, "learning_rate": 1.0057141235564423e-07, "loss": 0.0796, "reward": 1.1458333432674408, "reward_std": 0.39061587303876877, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6458333432674408, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 721.2014007568359, "epoch": 0.988009592326139, "grad_norm": 1.6094244664912092, "kl": 0.51513671875, "learning_rate": 1.0025399091530193e-07, "loss": 0.0913, "reward": 1.2239583432674408, "reward_std": 0.34610963612794876, "rewards/accuracy_reward": 0.5347222089767456, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6892361044883728, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 612.5069351196289, "epoch": 0.9928057553956835, "grad_norm": 1.7144937573704717, "kl": 0.45556640625, "learning_rate": 1.0006350220941502e-07, "loss": 0.0338, "reward": 1.3229166865348816, "reward_std": 0.37486525624990463, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.739583358168602, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 573.1250076293945, "epoch": 0.9976019184652278, "grad_norm": 1.1702049182470995, "kl": 0.43017578125, "learning_rate": 1e-07, "loss": 0.02, "reward": 1.237847238779068, "reward_std": 0.40600838512182236, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6753472089767456, "step": 208 }, { "epoch": 0.9976019184652278, "step": 208, "total_flos": 0.0, "train_loss": 0.05009339519780882, "train_runtime": 7148.6291, "train_samples_per_second": 1.049, "train_steps_per_second": 0.029 } ], "logging_steps": 1, "max_steps": 208, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }