| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9999146539216524, |
| "eval_steps": 500, |
| "global_step": 2929, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1028.7812957763672, |
| "epoch": 0.0003413843133907997, |
| "grad_norm": 0.27919793128967285, |
| "kl": 0.0, |
| "learning_rate": 1.0238907849829352e-08, |
| "loss": 0.0496, |
| "reward": 0.2734375111758709, |
| "reward_std": 0.28523072227835655, |
| "rewards/accuracy_reward": 0.19196430034935474, |
| "rewards/format_reward": 0.017857144121080637, |
| "rewards/tag_count_reward": 0.06361607415601611, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1013.6094207763672, |
| "epoch": 0.0017069215669539984, |
| "grad_norm": 0.5027251839637756, |
| "kl": 0.0002154707908630371, |
| "learning_rate": 5.119453924914676e-08, |
| "loss": 0.017, |
| "reward": 0.2388392947614193, |
| "reward_std": 0.24704269948415458, |
| "rewards/accuracy_reward": 0.1718750090803951, |
| "rewards/format_reward": 0.016741072293370962, |
| "rewards/tag_count_reward": 0.050223216734593734, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1001.812548828125, |
| "epoch": 0.003413843133907997, |
| "grad_norm": 0.3978239595890045, |
| "kl": 0.00025196075439453124, |
| "learning_rate": 1.0238907849829352e-07, |
| "loss": 0.042, |
| "reward": 0.23370536863803865, |
| "reward_std": 0.24270428121089935, |
| "rewards/accuracy_reward": 0.17500000707805158, |
| "rewards/format_reward": 0.010714286286383868, |
| "rewards/tag_count_reward": 0.04799107422586531, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 963.5545104980469, |
| "epoch": 0.005120764700861995, |
| "grad_norm": 0.47105857729911804, |
| "kl": 0.0002925872802734375, |
| "learning_rate": 1.5358361774744026e-07, |
| "loss": 0.0327, |
| "reward": 0.2330357251688838, |
| "reward_std": 0.21997303143143654, |
| "rewards/accuracy_reward": 0.17500000745058059, |
| "rewards/format_reward": 0.012500000558793545, |
| "rewards/tag_count_reward": 0.045535716018639504, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1039.2929138183595, |
| "epoch": 0.006827686267815994, |
| "grad_norm": 0.3969825208187103, |
| "kl": 0.00026645660400390623, |
| "learning_rate": 2.0477815699658704e-07, |
| "loss": 0.0455, |
| "reward": 0.2522321570664644, |
| "reward_std": 0.25163545124232767, |
| "rewards/accuracy_reward": 0.18928572181612252, |
| "rewards/format_reward": 0.011607143469154835, |
| "rewards/tag_count_reward": 0.05133928842842579, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 989.4536163330079, |
| "epoch": 0.008534607834769992, |
| "grad_norm": 0.24376584589481354, |
| "kl": 0.00028543472290039064, |
| "learning_rate": 2.559726962457338e-07, |
| "loss": 0.0339, |
| "reward": 0.23214286826550962, |
| "reward_std": 0.20356001779437066, |
| "rewards/accuracy_reward": 0.1812500089406967, |
| "rewards/format_reward": 0.009821429010480642, |
| "rewards/tag_count_reward": 0.04107143094297498, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 974.0723571777344, |
| "epoch": 0.01024152940172399, |
| "grad_norm": 0.42095184326171875, |
| "kl": 0.00038013458251953127, |
| "learning_rate": 3.0716723549488053e-07, |
| "loss": 0.0278, |
| "reward": 0.21383929550647734, |
| "reward_std": 0.21393342763185502, |
| "rewards/accuracy_reward": 0.15892857927829027, |
| "rewards/format_reward": 0.008035714644938708, |
| "rewards/tag_count_reward": 0.04687500244472176, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 982.08486328125, |
| "epoch": 0.01194845096867799, |
| "grad_norm": 0.7862982153892517, |
| "kl": 0.0004618644714355469, |
| "learning_rate": 3.583617747440273e-07, |
| "loss": 0.0243, |
| "reward": 0.2435267960652709, |
| "reward_std": 0.26311668269336225, |
| "rewards/accuracy_reward": 0.1633928634226322, |
| "rewards/format_reward": 0.014285715017467737, |
| "rewards/tag_count_reward": 0.06584821809083223, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 954.0571929931641, |
| "epoch": 0.013655372535631987, |
| "grad_norm": 0.5552359819412231, |
| "kl": 0.005234432220458984, |
| "learning_rate": 4.0955631399317407e-07, |
| "loss": 0.0108, |
| "reward": 0.22209822572767735, |
| "reward_std": 0.22155285775661468, |
| "rewards/accuracy_reward": 0.15803572135046123, |
| "rewards/format_reward": 0.007142857555299998, |
| "rewards/tag_count_reward": 0.0569196455180645, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1002.6321838378906, |
| "epoch": 0.015362294102585987, |
| "grad_norm": 0.424653023481369, |
| "kl": 0.0035940170288085937, |
| "learning_rate": 4.6075085324232084e-07, |
| "loss": 0.0258, |
| "reward": 0.2645089395344257, |
| "reward_std": 0.2839862532913685, |
| "rewards/accuracy_reward": 0.18482143767178058, |
| "rewards/format_reward": 0.01785714365541935, |
| "rewards/tag_count_reward": 0.06183036016300321, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1023.3384460449219, |
| "epoch": 0.017069215669539985, |
| "grad_norm": 0.43382683396339417, |
| "kl": 0.0026798248291015625, |
| "learning_rate": 5.119453924914676e-07, |
| "loss": 0.0376, |
| "reward": 0.26004465520381925, |
| "reward_std": 0.2732875030487776, |
| "rewards/accuracy_reward": 0.15357143431901932, |
| "rewards/format_reward": 0.020535715203732253, |
| "rewards/tag_count_reward": 0.08593750447034836, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 954.4053985595704, |
| "epoch": 0.018776137236493982, |
| "grad_norm": 0.7390643358230591, |
| "kl": 0.017375946044921875, |
| "learning_rate": 5.631399317406143e-07, |
| "loss": 0.0153, |
| "reward": 0.2988839440047741, |
| "reward_std": 0.32841442078351973, |
| "rewards/accuracy_reward": 0.15000000689178705, |
| "rewards/format_reward": 0.03035714467987418, |
| "rewards/tag_count_reward": 0.11852679029107094, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 975.8955688476562, |
| "epoch": 0.02048305880344798, |
| "grad_norm": 0.5477886199951172, |
| "kl": 0.021570587158203126, |
| "learning_rate": 6.143344709897611e-07, |
| "loss": 0.0459, |
| "reward": 0.4064732290804386, |
| "reward_std": 0.42607217878103254, |
| "rewards/accuracy_reward": 0.18392858225852252, |
| "rewards/format_reward": 0.05892857350409031, |
| "rewards/tag_count_reward": 0.1636160772293806, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 849.24736328125, |
| "epoch": 0.02218998037040198, |
| "grad_norm": 0.4776475429534912, |
| "kl": 0.0297119140625, |
| "learning_rate": 6.655290102389079e-07, |
| "loss": 0.0415, |
| "reward": 0.47053573578596114, |
| "reward_std": 0.470707942545414, |
| "rewards/accuracy_reward": 0.20803572395816444, |
| "rewards/format_reward": 0.06785714607685804, |
| "rewards/tag_count_reward": 0.19464286640286446, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1000.8661163330078, |
| "epoch": 0.02389690193735598, |
| "grad_norm": 0.5569175481796265, |
| "kl": 0.018768310546875, |
| "learning_rate": 7.167235494880546e-07, |
| "loss": 0.031, |
| "reward": 0.4183035880327225, |
| "reward_std": 0.4322426520287991, |
| "rewards/accuracy_reward": 0.1446428621187806, |
| "rewards/format_reward": 0.07053571781143546, |
| "rewards/tag_count_reward": 0.20312501080334186, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 996.6652221679688, |
| "epoch": 0.025603823504309977, |
| "grad_norm": 0.3041313588619232, |
| "kl": 0.0203216552734375, |
| "learning_rate": 7.679180887372013e-07, |
| "loss": 0.0353, |
| "reward": 0.529464314877987, |
| "reward_std": 0.5286803618073463, |
| "rewards/accuracy_reward": 0.16250000819563865, |
| "rewards/format_reward": 0.12142857620492578, |
| "rewards/tag_count_reward": 0.24553572833538057, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1012.7580841064453, |
| "epoch": 0.027310745071263975, |
| "grad_norm": 0.6267422437667847, |
| "kl": 0.0195526123046875, |
| "learning_rate": 8.191126279863481e-07, |
| "loss": 0.0586, |
| "reward": 0.5212053760886193, |
| "reward_std": 0.5019170552492142, |
| "rewards/accuracy_reward": 0.1580357219092548, |
| "rewards/format_reward": 0.1089285776950419, |
| "rewards/tag_count_reward": 0.2542410835623741, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 962.2661163330079, |
| "epoch": 0.029017666638217973, |
| "grad_norm": 0.5902665853500366, |
| "kl": 0.0366546630859375, |
| "learning_rate": 8.703071672354949e-07, |
| "loss": 0.015, |
| "reward": 0.6316964507102967, |
| "reward_std": 0.5959798350930214, |
| "rewards/accuracy_reward": 0.18482143711298704, |
| "rewards/format_reward": 0.1455357214435935, |
| "rewards/tag_count_reward": 0.30133929997682574, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1017.0955841064454, |
| "epoch": 0.030724588205171974, |
| "grad_norm": 0.5561809539794922, |
| "kl": 0.0215423583984375, |
| "learning_rate": 9.215017064846417e-07, |
| "loss": 0.049, |
| "reward": 0.6587053805589675, |
| "reward_std": 0.5868644163012504, |
| "rewards/accuracy_reward": 0.18839286528527738, |
| "rewards/format_reward": 0.15982143692672252, |
| "rewards/tag_count_reward": 0.3104910857975483, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1010.9018280029297, |
| "epoch": 0.03243150977212597, |
| "grad_norm": 0.3226785659790039, |
| "kl": 0.0638458251953125, |
| "learning_rate": 9.726962457337883e-07, |
| "loss": 0.064, |
| "reward": 0.6285714544355869, |
| "reward_std": 0.6115485787391662, |
| "rewards/accuracy_reward": 0.1625000076368451, |
| "rewards/format_reward": 0.1616071503609419, |
| "rewards/tag_count_reward": 0.3044643014669418, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 989.2018280029297, |
| "epoch": 0.03413843133907997, |
| "grad_norm": 0.38331085443496704, |
| "kl": 0.0280029296875, |
| "learning_rate": 1.0238907849829352e-06, |
| "loss": 0.044, |
| "reward": 0.7116071745753288, |
| "reward_std": 0.6793092235922813, |
| "rewards/accuracy_reward": 0.15000000689178705, |
| "rewards/format_reward": 0.20982143543660642, |
| "rewards/tag_count_reward": 0.3517857328057289, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 989.4839752197265, |
| "epoch": 0.03584535290603397, |
| "grad_norm": 0.31946083903312683, |
| "kl": 0.028564453125, |
| "learning_rate": 1.075085324232082e-06, |
| "loss": 0.05, |
| "reward": 0.7866071730852127, |
| "reward_std": 0.6944080710411071, |
| "rewards/accuracy_reward": 0.15625000447034837, |
| "rewards/format_reward": 0.24464286714792252, |
| "rewards/tag_count_reward": 0.38571430146694186, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 993.9964782714844, |
| "epoch": 0.037552274472987965, |
| "grad_norm": 0.48181456327438354, |
| "kl": 0.055322265625, |
| "learning_rate": 1.1262798634812287e-06, |
| "loss": 0.0478, |
| "reward": 0.8959821820259094, |
| "reward_std": 0.7488934248685837, |
| "rewards/accuracy_reward": 0.15357143450528382, |
| "rewards/format_reward": 0.30000001564621925, |
| "rewards/tag_count_reward": 0.44241073727607727, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 982.9768310546875, |
| "epoch": 0.03925919603994196, |
| "grad_norm": 0.5322438478469849, |
| "kl": 0.094439697265625, |
| "learning_rate": 1.1774744027303754e-06, |
| "loss": 0.027, |
| "reward": 0.9075893253087998, |
| "reward_std": 0.6916713267564774, |
| "rewards/accuracy_reward": 0.175892864074558, |
| "rewards/format_reward": 0.29464287012815477, |
| "rewards/tag_count_reward": 0.43705359250307085, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1018.8009429931641, |
| "epoch": 0.04096611760689596, |
| "grad_norm": 2.290813684463501, |
| "kl": 1.0316864013671876, |
| "learning_rate": 1.2286689419795221e-06, |
| "loss": 0.1651, |
| "reward": 0.8718750342726708, |
| "reward_std": 0.71419677734375, |
| "rewards/accuracy_reward": 0.12053572116419672, |
| "rewards/format_reward": 0.3089285887777805, |
| "rewards/tag_count_reward": 0.4424107387661934, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1019.8580902099609, |
| "epoch": 0.04267303917384996, |
| "grad_norm": 1.6987059116363525, |
| "kl": 0.052471923828125, |
| "learning_rate": 1.279863481228669e-06, |
| "loss": 0.0799, |
| "reward": 0.9720982730388641, |
| "reward_std": 0.7306801319122315, |
| "rewards/accuracy_reward": 0.175000006146729, |
| "rewards/format_reward": 0.32232144474983215, |
| "rewards/tag_count_reward": 0.47477681189775467, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1004.2143371582031, |
| "epoch": 0.04437996074080396, |
| "grad_norm": 3.3034074306488037, |
| "kl": 0.07410888671875, |
| "learning_rate": 1.3310580204778158e-06, |
| "loss": 0.0982, |
| "reward": 0.9348214775323868, |
| "reward_std": 0.7496399849653244, |
| "rewards/accuracy_reward": 0.13303572088479995, |
| "rewards/format_reward": 0.3044643007218838, |
| "rewards/tag_count_reward": 0.4973214492201805, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1024.3098571777343, |
| "epoch": 0.04608688230775796, |
| "grad_norm": 1.845048427581787, |
| "kl": 0.21453857421875, |
| "learning_rate": 1.3822525597269625e-06, |
| "loss": 0.0697, |
| "reward": 1.1470982640981675, |
| "reward_std": 0.7625433832406998, |
| "rewards/accuracy_reward": 0.16428572153672577, |
| "rewards/format_reward": 0.3892857313156128, |
| "rewards/tag_count_reward": 0.5935268118977547, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1005.0714691162109, |
| "epoch": 0.04779380387471196, |
| "grad_norm": 1.300987720489502, |
| "kl": 0.5607666015625, |
| "learning_rate": 1.4334470989761092e-06, |
| "loss": 0.1016, |
| "reward": 1.3178571999073028, |
| "reward_std": 0.7537211120128632, |
| "rewards/accuracy_reward": 0.17410714970901608, |
| "rewards/format_reward": 0.46875002086162565, |
| "rewards/tag_count_reward": 0.6750000268220901, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 971.468798828125, |
| "epoch": 0.049500725441665956, |
| "grad_norm": 1.4947351217269897, |
| "kl": 0.1273681640625, |
| "learning_rate": 1.484641638225256e-06, |
| "loss": 0.0903, |
| "reward": 1.4821429193019866, |
| "reward_std": 0.7479644685983657, |
| "rewards/accuracy_reward": 0.17321429271250963, |
| "rewards/format_reward": 0.5758928790688514, |
| "rewards/tag_count_reward": 0.7330357491970062, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 907.8357604980469, |
| "epoch": 0.051207647008619954, |
| "grad_norm": 3.5529232025146484, |
| "kl": 1.070947265625, |
| "learning_rate": 1.5358361774744026e-06, |
| "loss": 0.0934, |
| "reward": 1.5294643580913543, |
| "reward_std": 0.7225078850984573, |
| "rewards/accuracy_reward": 0.186607151851058, |
| "rewards/format_reward": 0.5839285984635353, |
| "rewards/tag_count_reward": 0.7589286029338836, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 962.1339691162109, |
| "epoch": 0.05291456857557395, |
| "grad_norm": 146.53732299804688, |
| "kl": 0.70908203125, |
| "learning_rate": 1.5870307167235496e-06, |
| "loss": 0.0814, |
| "reward": 1.5712054193019866, |
| "reward_std": 0.7219390630722046, |
| "rewards/accuracy_reward": 0.1517857219092548, |
| "rewards/format_reward": 0.6169643193483353, |
| "rewards/tag_count_reward": 0.8024553954601288, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 974.2196868896484, |
| "epoch": 0.05462149014252795, |
| "grad_norm": 38.124210357666016, |
| "kl": 2.0916015625, |
| "learning_rate": 1.6382252559726963e-06, |
| "loss": 0.1192, |
| "reward": 1.5966518700122834, |
| "reward_std": 0.7490073859691619, |
| "rewards/accuracy_reward": 0.18214286491274834, |
| "rewards/format_reward": 0.6294643074274063, |
| "rewards/tag_count_reward": 0.7850446790456772, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1026.6437927246093, |
| "epoch": 0.05632841170948195, |
| "grad_norm": 50.31554412841797, |
| "kl": 1.8455078125, |
| "learning_rate": 1.6894197952218432e-06, |
| "loss": 0.1378, |
| "reward": 1.5475447177886963, |
| "reward_std": 0.740731555223465, |
| "rewards/accuracy_reward": 0.14553572097793221, |
| "rewards/format_reward": 0.6187500298023224, |
| "rewards/tag_count_reward": 0.7832589596509933, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1012.067904663086, |
| "epoch": 0.058035333276435945, |
| "grad_norm": 55.0086555480957, |
| "kl": 2.6337890625, |
| "learning_rate": 1.7406143344709897e-06, |
| "loss": 0.1877, |
| "reward": 1.483928644657135, |
| "reward_std": 0.8173549324274063, |
| "rewards/accuracy_reward": 0.15535715073347092, |
| "rewards/format_reward": 0.5758928820490837, |
| "rewards/tag_count_reward": 0.752678605914116, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1027.2464630126954, |
| "epoch": 0.05974225484338994, |
| "grad_norm": 58.59043502807617, |
| "kl": 3.63203125, |
| "learning_rate": 1.7918088737201367e-06, |
| "loss": 0.2713, |
| "reward": 1.4720982789993287, |
| "reward_std": 0.8386385828256607, |
| "rewards/accuracy_reward": 0.15535714933648706, |
| "rewards/format_reward": 0.5723214507102966, |
| "rewards/tag_count_reward": 0.7444196730852127, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1085.983087158203, |
| "epoch": 0.06144917641034395, |
| "grad_norm": 118.3796615600586, |
| "kl": 3.1908203125, |
| "learning_rate": 1.8430034129692834e-06, |
| "loss": 0.2424, |
| "reward": 1.3830357909202575, |
| "reward_std": 0.8672506153583527, |
| "rewards/accuracy_reward": 0.14375000717118383, |
| "rewards/format_reward": 0.5339285925030708, |
| "rewards/tag_count_reward": 0.7053571850061416, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1069.7571899414063, |
| "epoch": 0.06315609797729795, |
| "grad_norm": 1056.704833984375, |
| "kl": 4.60859375, |
| "learning_rate": 1.8941979522184299e-06, |
| "loss": 0.349, |
| "reward": 1.483705425262451, |
| "reward_std": 0.8457996159791946, |
| "rewards/accuracy_reward": 0.17142857862636446, |
| "rewards/format_reward": 0.5830357432365417, |
| "rewards/tag_count_reward": 0.729241105914116, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1033.57861328125, |
| "epoch": 0.06486301954425194, |
| "grad_norm": 116.93949890136719, |
| "kl": 4.104296875, |
| "learning_rate": 1.9453924914675766e-06, |
| "loss": 0.2809, |
| "reward": 1.4609375596046448, |
| "reward_std": 0.8616278827190399, |
| "rewards/accuracy_reward": 0.168750009406358, |
| "rewards/format_reward": 0.5660714522004128, |
| "rewards/tag_count_reward": 0.7261161029338836, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1109.1634399414063, |
| "epoch": 0.06656994111120594, |
| "grad_norm": 226.57644653320312, |
| "kl": 4.3140625, |
| "learning_rate": 1.9965870307167235e-06, |
| "loss": 0.3192, |
| "reward": 1.3892857730388641, |
| "reward_std": 0.8872519373893738, |
| "rewards/accuracy_reward": 0.11160714821889997, |
| "rewards/format_reward": 0.5732143193483352, |
| "rewards/tag_count_reward": 0.704464316368103, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1111.0705841064453, |
| "epoch": 0.06827686267815994, |
| "grad_norm": 40.324981689453125, |
| "kl": 6.765625, |
| "learning_rate": 2.0477815699658705e-06, |
| "loss": 0.4585, |
| "reward": 1.3991072058677674, |
| "reward_std": 0.904101237654686, |
| "rewards/accuracy_reward": 0.14107143497094513, |
| "rewards/format_reward": 0.5598214507102967, |
| "rewards/tag_count_reward": 0.6982143223285675, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1101.1598724365235, |
| "epoch": 0.06998378424511394, |
| "grad_norm": 24.814128875732422, |
| "kl": 3.97109375, |
| "learning_rate": 2.098976109215017e-06, |
| "loss": 0.3623, |
| "reward": 1.4299107789993286, |
| "reward_std": 0.8886282354593277, |
| "rewards/accuracy_reward": 0.1821428656578064, |
| "rewards/format_reward": 0.5482143044471741, |
| "rewards/tag_count_reward": 0.6995536029338837, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1134.5839721679688, |
| "epoch": 0.07169070581206793, |
| "grad_norm": 35.07132339477539, |
| "kl": 4.96171875, |
| "learning_rate": 2.150170648464164e-06, |
| "loss": 0.4362, |
| "reward": 1.3689732730388642, |
| "reward_std": 0.9125554233789444, |
| "rewards/accuracy_reward": 0.15535715082660317, |
| "rewards/format_reward": 0.5392857372760773, |
| "rewards/tag_count_reward": 0.6743303894996643, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1099.2161224365234, |
| "epoch": 0.07339762737902193, |
| "grad_norm": 27.72007942199707, |
| "kl": 5.187109375, |
| "learning_rate": 2.201365187713311e-06, |
| "loss": 0.4161, |
| "reward": 1.3982143580913544, |
| "reward_std": 0.8859813660383224, |
| "rewards/accuracy_reward": 0.18750000819563867, |
| "rewards/format_reward": 0.5401785910129547, |
| "rewards/tag_count_reward": 0.6705357432365417, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1172.6286163330078, |
| "epoch": 0.07510454894597593, |
| "grad_norm": 28.218076705932617, |
| "kl": 4.496875, |
| "learning_rate": 2.2525597269624573e-06, |
| "loss": 0.3957, |
| "reward": 1.3350447058677672, |
| "reward_std": 0.907436516880989, |
| "rewards/accuracy_reward": 0.14375000763684512, |
| "rewards/format_reward": 0.5258928805589675, |
| "rewards/tag_count_reward": 0.6654018104076386, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1138.0920104980469, |
| "epoch": 0.07681147051292993, |
| "grad_norm": 30.30360221862793, |
| "kl": 4.7421875, |
| "learning_rate": 2.3037542662116043e-06, |
| "loss": 0.3741, |
| "reward": 1.3508929312229156, |
| "reward_std": 0.9466613680124283, |
| "rewards/accuracy_reward": 0.15625000512227416, |
| "rewards/format_reward": 0.5321428820490837, |
| "rewards/tag_count_reward": 0.6625000238418579, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1146.39736328125, |
| "epoch": 0.07851839207988393, |
| "grad_norm": 15.380309104919434, |
| "kl": 6.12265625, |
| "learning_rate": 2.3549488054607508e-06, |
| "loss": 0.4282, |
| "reward": 1.33727685213089, |
| "reward_std": 0.9166835993528366, |
| "rewards/accuracy_reward": 0.13839286239817739, |
| "rewards/format_reward": 0.5285714492201805, |
| "rewards/tag_count_reward": 0.6703125298023224, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1135.8580841064454, |
| "epoch": 0.08022531364683792, |
| "grad_norm": 49.58182907104492, |
| "kl": 4.6671875, |
| "learning_rate": 2.4061433447098977e-06, |
| "loss": 0.3852, |
| "reward": 1.3584822177886964, |
| "reward_std": 0.8921870917081833, |
| "rewards/accuracy_reward": 0.12500000577419995, |
| "rewards/format_reward": 0.5598214492201805, |
| "rewards/tag_count_reward": 0.673660746216774, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1134.0955810546875, |
| "epoch": 0.08193223521379192, |
| "grad_norm": 9.52077865600586, |
| "kl": 4.63515625, |
| "learning_rate": 2.4573378839590442e-06, |
| "loss": 0.3793, |
| "reward": 1.4533482909202575, |
| "reward_std": 0.9463501214981079, |
| "rewards/accuracy_reward": 0.21607143534347414, |
| "rewards/format_reward": 0.5598214536905288, |
| "rewards/tag_count_reward": 0.6774553894996643, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1126.5911193847655, |
| "epoch": 0.08363915678074592, |
| "grad_norm": 12.795201301574707, |
| "kl": 5.07578125, |
| "learning_rate": 2.508532423208191e-06, |
| "loss": 0.444, |
| "reward": 1.4241071879863738, |
| "reward_std": 0.9276215642690658, |
| "rewards/accuracy_reward": 0.16607143776491284, |
| "rewards/format_reward": 0.5776786029338836, |
| "rewards/tag_count_reward": 0.6803571671247483, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1206.5196990966797, |
| "epoch": 0.08534607834769992, |
| "grad_norm": 25.597183227539062, |
| "kl": 6.453125, |
| "learning_rate": 2.559726962457338e-06, |
| "loss": 0.5221, |
| "reward": 1.3265625447034837, |
| "reward_std": 0.9622172951698303, |
| "rewards/accuracy_reward": 0.16517857862636448, |
| "rewards/format_reward": 0.529464316368103, |
| "rewards/tag_count_reward": 0.631919664144516, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1214.5321838378907, |
| "epoch": 0.08705299991465393, |
| "grad_norm": 12.811568260192871, |
| "kl": 5.79453125, |
| "learning_rate": 2.6109215017064846e-06, |
| "loss": 0.5348, |
| "reward": 1.3906250774860383, |
| "reward_std": 0.9294156819581986, |
| "rewards/accuracy_reward": 0.1723214373923838, |
| "rewards/format_reward": 0.5633928835391998, |
| "rewards/tag_count_reward": 0.6549107417464256, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1202.7625549316406, |
| "epoch": 0.08875992148160793, |
| "grad_norm": 154.68002319335938, |
| "kl": 5.7234375, |
| "learning_rate": 2.6621160409556315e-06, |
| "loss": 0.4805, |
| "reward": 1.361384004354477, |
| "reward_std": 0.9610002607107162, |
| "rewards/accuracy_reward": 0.1544642921537161, |
| "rewards/format_reward": 0.555357164144516, |
| "rewards/tag_count_reward": 0.6515625298023224, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1146.6179077148438, |
| "epoch": 0.09046684304856192, |
| "grad_norm": 6.1883440017700195, |
| "kl": 5.08046875, |
| "learning_rate": 2.7133105802047784e-06, |
| "loss": 0.4617, |
| "reward": 1.4383929252624512, |
| "reward_std": 0.9193685740232468, |
| "rewards/accuracy_reward": 0.14017857760190963, |
| "rewards/format_reward": 0.6000000268220902, |
| "rewards/tag_count_reward": 0.698214316368103, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1172.1741607666015, |
| "epoch": 0.09217376461551592, |
| "grad_norm": 11.599955558776855, |
| "kl": 5.28671875, |
| "learning_rate": 2.764505119453925e-06, |
| "loss": 0.4665, |
| "reward": 1.4604911386966706, |
| "reward_std": 0.923082035779953, |
| "rewards/accuracy_reward": 0.15178572060540318, |
| "rewards/format_reward": 0.6098214566707612, |
| "rewards/tag_count_reward": 0.6988839596509934, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1103.8205841064453, |
| "epoch": 0.09388068618246992, |
| "grad_norm": 24.47557830810547, |
| "kl": 5.521875, |
| "learning_rate": 2.8156996587030715e-06, |
| "loss": 0.4808, |
| "reward": 1.4462054193019866, |
| "reward_std": 0.8959993481636047, |
| "rewards/accuracy_reward": 0.141071433480829, |
| "rewards/format_reward": 0.6080357372760773, |
| "rewards/tag_count_reward": 0.6970982432365418, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1036.750051879883, |
| "epoch": 0.09558760774942392, |
| "grad_norm": 18.15721893310547, |
| "kl": 5.049609375, |
| "learning_rate": 2.8668941979522184e-06, |
| "loss": 0.3953, |
| "reward": 1.5037946939468383, |
| "reward_std": 0.8739502459764481, |
| "rewards/accuracy_reward": 0.14821429317817092, |
| "rewards/format_reward": 0.6312500327825546, |
| "rewards/tag_count_reward": 0.7243303894996643, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1079.7732727050782, |
| "epoch": 0.09729452931637791, |
| "grad_norm": 9.835553169250488, |
| "kl": 4.321875, |
| "learning_rate": 2.9180887372013653e-06, |
| "loss": 0.3538, |
| "reward": 1.4616072118282317, |
| "reward_std": 0.9075537651777268, |
| "rewards/accuracy_reward": 0.15446429196745157, |
| "rewards/format_reward": 0.6044643148779869, |
| "rewards/tag_count_reward": 0.7026785999536515, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1014.2696868896485, |
| "epoch": 0.09900145088333191, |
| "grad_norm": 65.74641418457031, |
| "kl": 5.026953125, |
| "learning_rate": 2.969283276450512e-06, |
| "loss": 0.4408, |
| "reward": 1.6125000774860383, |
| "reward_std": 0.8444355905056, |
| "rewards/accuracy_reward": 0.16428572293370963, |
| "rewards/format_reward": 0.685714316368103, |
| "rewards/tag_count_reward": 0.7625000268220902, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 975.2527221679687, |
| "epoch": 0.10070837245028591, |
| "grad_norm": 8.98360538482666, |
| "kl": 4.430078125, |
| "learning_rate": 2.999995738818993e-06, |
| "loss": 0.4058, |
| "reward": 1.6013393580913544, |
| "reward_std": 0.8386077880859375, |
| "rewards/accuracy_reward": 0.15714286370202898, |
| "rewards/format_reward": 0.6794643223285675, |
| "rewards/tag_count_reward": 0.7647321850061417, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1040.8562866210937, |
| "epoch": 0.10241529401723991, |
| "grad_norm": 7.139811992645264, |
| "kl": 5.41328125, |
| "learning_rate": 2.9999478008106995e-06, |
| "loss": 0.5289, |
| "reward": 1.5127232789993286, |
| "reward_std": 0.8606418490409851, |
| "rewards/accuracy_reward": 0.18571429569274187, |
| "rewards/format_reward": 0.6151785984635353, |
| "rewards/tag_count_reward": 0.7118303835391998, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 999.74111328125, |
| "epoch": 0.1041222155841939, |
| "grad_norm": 8.274175643920898, |
| "kl": 4.5482421875, |
| "learning_rate": 2.9998466000257944e-06, |
| "loss": 0.3431, |
| "reward": 1.5814732909202576, |
| "reward_std": 0.8399073332548141, |
| "rewards/accuracy_reward": 0.17142857825383545, |
| "rewards/format_reward": 0.6616071701049805, |
| "rewards/tag_count_reward": 0.7484375268220902, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 984.4464752197266, |
| "epoch": 0.1058291371511479, |
| "grad_norm": 5.403537750244141, |
| "kl": 4.11796875, |
| "learning_rate": 2.999692140057893e-06, |
| "loss": 0.3573, |
| "reward": 1.6404018580913544, |
| "reward_std": 0.7838103622198105, |
| "rewards/accuracy_reward": 0.22946429662406445, |
| "rewards/format_reward": 0.658035746216774, |
| "rewards/tag_count_reward": 0.752901816368103, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1023.7839813232422, |
| "epoch": 0.1075360587181019, |
| "grad_norm": 10.112387657165527, |
| "kl": 4.18515625, |
| "learning_rate": 2.999484426391831e-06, |
| "loss": 0.3199, |
| "reward": 1.5156250596046448, |
| "reward_std": 0.8099619418382644, |
| "rewards/accuracy_reward": 0.11517857648432255, |
| "rewards/format_reward": 0.6580357402563095, |
| "rewards/tag_count_reward": 0.7424107432365418, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 939.3991516113281, |
| "epoch": 0.1092429802850559, |
| "grad_norm": 4.295641899108887, |
| "kl": 3.85234375, |
| "learning_rate": 2.9992234664034687e-06, |
| "loss": 0.3389, |
| "reward": 1.6013393521308898, |
| "reward_std": 0.8064254641532898, |
| "rewards/accuracy_reward": 0.16875000642612575, |
| "rewards/format_reward": 0.6758928924798966, |
| "rewards/tag_count_reward": 0.7566964656114579, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 944.2964782714844, |
| "epoch": 0.1109499018520099, |
| "grad_norm": 6.538363456726074, |
| "kl": 4.298046875, |
| "learning_rate": 2.998909269359431e-06, |
| "loss": 0.3693, |
| "reward": 1.712946504354477, |
| "reward_std": 0.7462773695588112, |
| "rewards/accuracy_reward": 0.1812500067986548, |
| "rewards/format_reward": 0.7276786029338836, |
| "rewards/tag_count_reward": 0.8040178924798965, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 995.8259307861329, |
| "epoch": 0.1126568234189639, |
| "grad_norm": 9.679475784301758, |
| "kl": 3.96171875, |
| "learning_rate": 2.9985418464167776e-06, |
| "loss": 0.3515, |
| "reward": 1.6656250715255738, |
| "reward_std": 0.7768863618373871, |
| "rewards/accuracy_reward": 0.18928572479635478, |
| "rewards/format_reward": 0.6937500298023224, |
| "rewards/tag_count_reward": 0.7825893223285675, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1035.0509338378906, |
| "epoch": 0.11436374498591789, |
| "grad_norm": 10.23958683013916, |
| "kl": 4.15234375, |
| "learning_rate": 2.9981212106226067e-06, |
| "loss": 0.4532, |
| "reward": 1.7529018700122834, |
| "reward_std": 0.7962618798017502, |
| "rewards/accuracy_reward": 0.20982143813744186, |
| "rewards/format_reward": 0.7437500268220901, |
| "rewards/tag_count_reward": 0.7993303954601287, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1050.3491577148438, |
| "epoch": 0.11607066655287189, |
| "grad_norm": 21.458248138427734, |
| "kl": 3.53984375, |
| "learning_rate": 2.9976473769135918e-06, |
| "loss": 0.3423, |
| "reward": 1.7979911386966705, |
| "reward_std": 0.7509608373045922, |
| "rewards/accuracy_reward": 0.24285715371370314, |
| "rewards/format_reward": 0.7500000417232513, |
| "rewards/tag_count_reward": 0.8051339626312256, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 985.1045043945312, |
| "epoch": 0.11777758811982589, |
| "grad_norm": 8.980128288269043, |
| "kl": 4.795703125, |
| "learning_rate": 2.997120362115451e-06, |
| "loss": 0.4425, |
| "reward": 1.8357143640518188, |
| "reward_std": 0.6679434359073639, |
| "rewards/accuracy_reward": 0.20267857778817416, |
| "rewards/format_reward": 0.7964286059141159, |
| "rewards/tag_count_reward": 0.8366071820259094, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 926.8366455078125, |
| "epoch": 0.11948450968677989, |
| "grad_norm": 12.840106964111328, |
| "kl": 2.9732421875, |
| "learning_rate": 2.99654018494235e-06, |
| "loss": 0.3475, |
| "reward": 1.8665179431438446, |
| "reward_std": 0.665838934481144, |
| "rewards/accuracy_reward": 0.22589286882430315, |
| "rewards/format_reward": 0.7919643223285675, |
| "rewards/tag_count_reward": 0.8486607581377029, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 898.8857543945312, |
| "epoch": 0.1211914312537339, |
| "grad_norm": 1.285402774810791, |
| "kl": 2.25703125, |
| "learning_rate": 2.9959068659962367e-06, |
| "loss": 0.2652, |
| "reward": 1.908035808801651, |
| "reward_std": 0.5491333983838558, |
| "rewards/accuracy_reward": 0.16607143646106123, |
| "rewards/format_reward": 0.8508929014205933, |
| "rewards/tag_count_reward": 0.8910714685916901, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 834.8294952392578, |
| "epoch": 0.1228983528206879, |
| "grad_norm": 16.34928321838379, |
| "kl": 4.01875, |
| "learning_rate": 2.995220427766111e-06, |
| "loss": 0.3496, |
| "reward": 1.764285796880722, |
| "reward_std": 0.6508073821663857, |
| "rewards/accuracy_reward": 0.1687500079162419, |
| "rewards/format_reward": 0.7500000327825547, |
| "rewards/tag_count_reward": 0.8455357521772384, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 824.4848602294921, |
| "epoch": 0.12460527438764189, |
| "grad_norm": 22.009183883666992, |
| "kl": 1.86591796875, |
| "learning_rate": 2.994480894627225e-06, |
| "loss": 0.1837, |
| "reward": 1.7015625834465027, |
| "reward_std": 0.6883793324232101, |
| "rewards/accuracy_reward": 0.16964286332949996, |
| "rewards/format_reward": 0.7187500268220901, |
| "rewards/tag_count_reward": 0.8131696820259094, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 828.4500396728515, |
| "epoch": 0.1263121959545959, |
| "grad_norm": 11.808023452758789, |
| "kl": 3.427734375, |
| "learning_rate": 2.9936882928402187e-06, |
| "loss": 0.2191, |
| "reward": 1.6609375894069671, |
| "reward_std": 0.7794422417879104, |
| "rewards/accuracy_reward": 0.20357143841683864, |
| "rewards/format_reward": 0.689285746216774, |
| "rewards/tag_count_reward": 0.7680803865194321, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 899.5518280029297, |
| "epoch": 0.12801911752154987, |
| "grad_norm": 3.559278964996338, |
| "kl": 2.0583984375, |
| "learning_rate": 2.992842650550186e-06, |
| "loss": 0.149, |
| "reward": 1.731026864051819, |
| "reward_std": 0.6875755071640015, |
| "rewards/accuracy_reward": 0.18214286481961608, |
| "rewards/format_reward": 0.7375000357627869, |
| "rewards/tag_count_reward": 0.81138396859169, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 935.9803985595703, |
| "epoch": 0.12972603908850389, |
| "grad_norm": 7.016046524047852, |
| "kl": 2.69150390625, |
| "learning_rate": 2.991943997785676e-06, |
| "loss": 0.21, |
| "reward": 1.8580358147621154, |
| "reward_std": 0.6217495501041412, |
| "rewards/accuracy_reward": 0.1910714365541935, |
| "rewards/format_reward": 0.8044643223285675, |
| "rewards/tag_count_reward": 0.8625000417232513, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 911.315219116211, |
| "epoch": 0.13143296065545787, |
| "grad_norm": 7.220456600189209, |
| "kl": 2.14296875, |
| "learning_rate": 2.9909923664576264e-06, |
| "loss": 0.1268, |
| "reward": 1.8678572356700898, |
| "reward_std": 0.5655309081077575, |
| "rewards/accuracy_reward": 0.19107143981382252, |
| "rewards/format_reward": 0.80357146859169, |
| "rewards/tag_count_reward": 0.8732143253087997, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 862.4652130126954, |
| "epoch": 0.13313988222241188, |
| "grad_norm": 7.075647830963135, |
| "kl": 1.9138671875, |
| "learning_rate": 2.9899877903582307e-06, |
| "loss": 0.1381, |
| "reward": 1.7910715162754058, |
| "reward_std": 0.5908127099275589, |
| "rewards/accuracy_reward": 0.16250000642612578, |
| "rewards/format_reward": 0.77857146859169, |
| "rewards/tag_count_reward": 0.8500000417232514, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 824.7241424560547, |
| "epoch": 0.13484680378936587, |
| "grad_norm": 4.559528350830078, |
| "kl": 1.3359130859375, |
| "learning_rate": 2.9889303051597403e-06, |
| "loss": 0.1208, |
| "reward": 1.9924108028411864, |
| "reward_std": 0.4078477367758751, |
| "rewards/accuracy_reward": 0.18035715036094188, |
| "rewards/format_reward": 0.8848214656114578, |
| "rewards/tag_count_reward": 0.9272321879863739, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 915.5607513427734, |
| "epoch": 0.13655372535631988, |
| "grad_norm": 6.0182414054870605, |
| "kl": 1.38389892578125, |
| "learning_rate": 2.9878199484131928e-06, |
| "loss": 0.1426, |
| "reward": 1.8991072237491609, |
| "reward_std": 0.5109399899840354, |
| "rewards/accuracy_reward": 0.20892858393490316, |
| "rewards/format_reward": 0.8116071820259094, |
| "rewards/tag_count_reward": 0.8785714656114578, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 918.3304016113282, |
| "epoch": 0.1382606469232739, |
| "grad_norm": 7.766772270202637, |
| "kl": 2.130078125, |
| "learning_rate": 2.986656759547082e-06, |
| "loss": 0.2231, |
| "reward": 1.9397322297096253, |
| "reward_std": 0.5156191930174827, |
| "rewards/accuracy_reward": 0.22053572321310638, |
| "rewards/format_reward": 0.8276786118745804, |
| "rewards/tag_count_reward": 0.891517898440361, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 915.7795013427734, |
| "epoch": 0.13996756849022787, |
| "grad_norm": 21.836950302124023, |
| "kl": 1.909521484375, |
| "learning_rate": 2.9854407798659583e-06, |
| "loss": 0.2047, |
| "reward": 1.9703125834465027, |
| "reward_std": 0.5022375226020813, |
| "rewards/accuracy_reward": 0.202678582072258, |
| "rewards/format_reward": 0.8616071850061416, |
| "rewards/tag_count_reward": 0.906026828289032, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 858.590219116211, |
| "epoch": 0.14167449005718188, |
| "grad_norm": 18.037555694580078, |
| "kl": 2.42880859375, |
| "learning_rate": 2.984172052548961e-06, |
| "loss": 0.2126, |
| "reward": 1.9808036744594575, |
| "reward_std": 0.5602116242051125, |
| "rewards/accuracy_reward": 0.22410715455189348, |
| "rewards/format_reward": 0.8526786148548127, |
| "rewards/tag_count_reward": 0.9040179014205932, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 856.7375366210938, |
| "epoch": 0.14338141162413587, |
| "grad_norm": 8.315873146057129, |
| "kl": 3.725, |
| "learning_rate": 2.982850622648283e-06, |
| "loss": 0.2975, |
| "reward": 1.8669643700122833, |
| "reward_std": 0.62198735922575, |
| "rewards/accuracy_reward": 0.20625000745058059, |
| "rewards/format_reward": 0.8044643253087997, |
| "rewards/tag_count_reward": 0.8562500387430191, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 860.3080749511719, |
| "epoch": 0.14508833319108988, |
| "grad_norm": 6.0542707443237305, |
| "kl": 2.50927734375, |
| "learning_rate": 2.9814765370875757e-06, |
| "loss": 0.1952, |
| "reward": 1.7029018640518188, |
| "reward_std": 0.732259088754654, |
| "rewards/accuracy_reward": 0.2017857251688838, |
| "rewards/format_reward": 0.7133928894996643, |
| "rewards/tag_count_reward": 0.7877232491970062, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 801.1982513427735, |
| "epoch": 0.14679525475804386, |
| "grad_norm": 7.207229137420654, |
| "kl": 3.885546875, |
| "learning_rate": 2.9800498446602777e-06, |
| "loss": 0.2705, |
| "reward": 1.6745536506175995, |
| "reward_std": 0.6814444154500962, |
| "rewards/accuracy_reward": 0.15446429196745157, |
| "rewards/format_reward": 0.7223214626312255, |
| "rewards/tag_count_reward": 0.7977678924798965, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 867.9830749511718, |
| "epoch": 0.14850217632499788, |
| "grad_norm": 3.2406466007232666, |
| "kl": 1.6208984375, |
| "learning_rate": 2.9785705960278854e-06, |
| "loss": 0.1428, |
| "reward": 1.6584822237491608, |
| "reward_std": 0.6414668798446655, |
| "rewards/accuracy_reward": 0.17946429708972572, |
| "rewards/format_reward": 0.6955357402563095, |
| "rewards/tag_count_reward": 0.7834821820259095, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 774.519677734375, |
| "epoch": 0.15020909789195186, |
| "grad_norm": 5.495645046234131, |
| "kl": 2.61328125, |
| "learning_rate": 2.977038843718153e-06, |
| "loss": 0.0369, |
| "reward": 1.54977685213089, |
| "reward_std": 0.7357341796159744, |
| "rewards/accuracy_reward": 0.16964286426082253, |
| "rewards/format_reward": 0.608035746216774, |
| "rewards/tag_count_reward": 0.772098246216774, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 753.3616424560547, |
| "epoch": 0.15191601945890587, |
| "grad_norm": 4.03928804397583, |
| "kl": 1.2857421875, |
| "learning_rate": 2.975454642123228e-06, |
| "loss": 0.0017, |
| "reward": 1.5091518580913543, |
| "reward_std": 0.7535226970911026, |
| "rewards/accuracy_reward": 0.24642858542501928, |
| "rewards/format_reward": 0.5053571626543999, |
| "rewards/tag_count_reward": 0.7573661029338836, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 806.1402191162109, |
| "epoch": 0.15362294102585985, |
| "grad_norm": 2.121255874633789, |
| "kl": 1.4345703125, |
| "learning_rate": 2.9738180474977184e-06, |
| "loss": 0.0222, |
| "reward": 1.5348214983940125, |
| "reward_std": 0.7348162770271301, |
| "rewards/accuracy_reward": 0.17053572023287417, |
| "rewards/format_reward": 0.5758928850293159, |
| "rewards/tag_count_reward": 0.7883928894996644, |
| "step": 450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 787.08486328125, |
| "epoch": 0.15532986259281387, |
| "grad_norm": 2.3794944286346436, |
| "kl": 2.43564453125, |
| "learning_rate": 2.972129117956695e-06, |
| "loss": 0.0834, |
| "reward": 1.6506697297096253, |
| "reward_std": 0.7070199698209763, |
| "rewards/accuracy_reward": 0.1687500079162419, |
| "rewards/format_reward": 0.6589286088943481, |
| "rewards/tag_count_reward": 0.8229911088943481, |
| "step": 455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 824.7964599609375, |
| "epoch": 0.15703678415976785, |
| "grad_norm": 1.7041521072387695, |
| "kl": 0.3784423828125, |
| "learning_rate": 2.9703879134736304e-06, |
| "loss": 0.0166, |
| "reward": 1.7145090103149414, |
| "reward_std": 0.6771728962659835, |
| "rewards/accuracy_reward": 0.16964286416769028, |
| "rewards/format_reward": 0.7071428954601288, |
| "rewards/tag_count_reward": 0.8377232521772384, |
| "step": 460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 845.7839599609375, |
| "epoch": 0.15874370572672186, |
| "grad_norm": 15.45041561126709, |
| "kl": 2.6501953125, |
| "learning_rate": 2.968594495878266e-06, |
| "loss": 0.0991, |
| "reward": 1.7546875834465028, |
| "reward_std": 0.613152152299881, |
| "rewards/accuracy_reward": 0.18303572051227093, |
| "rewards/format_reward": 0.7321428894996643, |
| "rewards/tag_count_reward": 0.8395089656114578, |
| "step": 465 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 880.5625366210937, |
| "epoch": 0.16045062729367585, |
| "grad_norm": 1.617084264755249, |
| "kl": 0.76259765625, |
| "learning_rate": 2.9667489288544177e-06, |
| "loss": 0.028, |
| "reward": 1.8450893878936767, |
| "reward_std": 0.5999127000570297, |
| "rewards/accuracy_reward": 0.20714286714792252, |
| "rewards/format_reward": 0.7776786088943481, |
| "rewards/tag_count_reward": 0.8602678865194321, |
| "step": 470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 810.7125396728516, |
| "epoch": 0.16215754886062986, |
| "grad_norm": 6.216405391693115, |
| "kl": 1.2310546875, |
| "learning_rate": 2.964851277937717e-06, |
| "loss": 0.0479, |
| "reward": 1.5607143580913543, |
| "reward_std": 0.6846371173858643, |
| "rewards/accuracy_reward": 0.1517857201397419, |
| "rewards/format_reward": 0.6464286029338837, |
| "rewards/tag_count_reward": 0.7625000357627869, |
| "step": 475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 854.5232513427734, |
| "epoch": 0.16386447042758384, |
| "grad_norm": 4.113749980926514, |
| "kl": 0.834716796875, |
| "learning_rate": 2.9629016105132797e-06, |
| "loss": 0.0379, |
| "reward": 1.7453125953674316, |
| "reward_std": 0.6409839779138565, |
| "rewards/accuracy_reward": 0.19017858048900962, |
| "rewards/format_reward": 0.7366071730852127, |
| "rewards/tag_count_reward": 0.818526816368103, |
| "step": 480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 798.655386352539, |
| "epoch": 0.16557139199453785, |
| "grad_norm": 18.216583251953125, |
| "kl": 3.745068359375, |
| "learning_rate": 2.9608999958133147e-06, |
| "loss": 0.1496, |
| "reward": 1.7482143700122834, |
| "reward_std": 0.5732271403074265, |
| "rewards/accuracy_reward": 0.1580357201397419, |
| "rewards/format_reward": 0.7580357432365418, |
| "rewards/tag_count_reward": 0.832142898440361, |
| "step": 485 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 796.342886352539, |
| "epoch": 0.16727831356149184, |
| "grad_norm": 3.8710556030273438, |
| "kl": 1.31533203125, |
| "learning_rate": 2.9588465049146673e-06, |
| "loss": -0.017, |
| "reward": 1.4564732909202576, |
| "reward_std": 0.7351120918989181, |
| "rewards/accuracy_reward": 0.1446428645402193, |
| "rewards/format_reward": 0.5937500238418579, |
| "rewards/tag_count_reward": 0.7180803924798965, |
| "step": 490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.3670013427734, |
| "epoch": 0.16898523512844585, |
| "grad_norm": 4.14769983291626, |
| "kl": 2.19775390625, |
| "learning_rate": 2.9567412107362925e-06, |
| "loss": 0.148, |
| "reward": 1.4696429371833801, |
| "reward_std": 0.7330913826823234, |
| "rewards/accuracy_reward": 0.1678571505472064, |
| "rewards/format_reward": 0.5955357432365418, |
| "rewards/tag_count_reward": 0.7062500327825546, |
| "step": 495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 821.0946746826172, |
| "epoch": 0.17069215669539983, |
| "grad_norm": 6.637725830078125, |
| "kl": 2.6984375, |
| "learning_rate": 2.954584188036668e-06, |
| "loss": 0.224, |
| "reward": 1.3792411386966705, |
| "reward_std": 0.7458429962396622, |
| "rewards/accuracy_reward": 0.14017857983708382, |
| "rewards/format_reward": 0.5544643133878708, |
| "rewards/tag_count_reward": 0.6845982402563096, |
| "step": 500 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 879.7982543945312, |
| "epoch": 0.17239907826235384, |
| "grad_norm": 4.457671642303467, |
| "kl": 2.8482421875, |
| "learning_rate": 2.952375513411137e-06, |
| "loss": 0.2476, |
| "reward": 1.3660714864730834, |
| "reward_std": 0.7996741533279419, |
| "rewards/accuracy_reward": 0.12946429178118707, |
| "rewards/format_reward": 0.5562500268220901, |
| "rewards/tag_count_reward": 0.6803571701049804, |
| "step": 505 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 940.1705780029297, |
| "epoch": 0.17410599982930786, |
| "grad_norm": 3.609261989593506, |
| "kl": 2.91796875, |
| "learning_rate": 2.9501152652891924e-06, |
| "loss": 0.3096, |
| "reward": 1.5997768640518188, |
| "reward_std": 0.7488022714853286, |
| "rewards/accuracy_reward": 0.15267857694998382, |
| "rewards/format_reward": 0.6794643104076385, |
| "rewards/tag_count_reward": 0.7676339626312256, |
| "step": 510 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 921.5455718994141, |
| "epoch": 0.17581292139626184, |
| "grad_norm": 3.6567742824554443, |
| "kl": 3.191015625, |
| "learning_rate": 2.947803523931687e-06, |
| "loss": 0.3447, |
| "reward": 1.6316965103149415, |
| "reward_std": 0.6867892518639565, |
| "rewards/accuracy_reward": 0.15625000847503542, |
| "rewards/format_reward": 0.6946428820490838, |
| "rewards/tag_count_reward": 0.7808036059141159, |
| "step": 515 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 931.1562896728516, |
| "epoch": 0.17751984296321585, |
| "grad_norm": 6.023605823516846, |
| "kl": 3.08828125, |
| "learning_rate": 2.945440371427987e-06, |
| "loss": 0.3204, |
| "reward": 1.5986607909202575, |
| "reward_std": 0.7305556893348694, |
| "rewards/accuracy_reward": 0.1598214365541935, |
| "rewards/format_reward": 0.6714285969734192, |
| "rewards/tag_count_reward": 0.767410746216774, |
| "step": 520 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 870.6866394042969, |
| "epoch": 0.17922676453016984, |
| "grad_norm": 4.5348615646362305, |
| "kl": 2.55, |
| "learning_rate": 2.943025891693054e-06, |
| "loss": 0.2878, |
| "reward": 1.7037947177886963, |
| "reward_std": 0.6444522187113761, |
| "rewards/accuracy_reward": 0.17410715110599995, |
| "rewards/format_reward": 0.7169643193483353, |
| "rewards/tag_count_reward": 0.8127232521772385, |
| "step": 525 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 843.8196807861328, |
| "epoch": 0.18093368609712385, |
| "grad_norm": 5.064777851104736, |
| "kl": 3.21171875, |
| "learning_rate": 2.940560170464469e-06, |
| "loss": 0.2754, |
| "reward": 1.6127232968807221, |
| "reward_std": 0.774041372537613, |
| "rewards/accuracy_reward": 0.144642864074558, |
| "rewards/format_reward": 0.6803571701049804, |
| "rewards/tag_count_reward": 0.7877232491970062, |
| "step": 530 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 864.150033569336, |
| "epoch": 0.18264060766407783, |
| "grad_norm": 3.2473807334899902, |
| "kl": 1.7181640625, |
| "learning_rate": 2.938043295299385e-06, |
| "loss": 0.2154, |
| "reward": 1.6125000834465026, |
| "reward_std": 0.7306119620800018, |
| "rewards/accuracy_reward": 0.1553571529686451, |
| "rewards/format_reward": 0.6508928865194321, |
| "rewards/tag_count_reward": 0.8062500417232513, |
| "step": 535 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 891.7946899414062, |
| "epoch": 0.18434752923103184, |
| "grad_norm": 3.4195573329925537, |
| "kl": 3.03828125, |
| "learning_rate": 2.9354753555714188e-06, |
| "loss": 0.3556, |
| "reward": 1.7002232849597931, |
| "reward_std": 0.6955976724624634, |
| "rewards/accuracy_reward": 0.14375000847503544, |
| "rewards/format_reward": 0.7267857462167739, |
| "rewards/tag_count_reward": 0.8296875327825546, |
| "step": 540 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 863.1812896728516, |
| "epoch": 0.18605445079798583, |
| "grad_norm": 4.025784969329834, |
| "kl": 2.8484375, |
| "learning_rate": 2.932856442467476e-06, |
| "loss": 0.2801, |
| "reward": 1.723660796880722, |
| "reward_std": 0.6836878031492233, |
| "rewards/accuracy_reward": 0.19375001210719348, |
| "rewards/format_reward": 0.7125000387430191, |
| "rewards/tag_count_reward": 0.817410746216774, |
| "step": 545 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 902.3437896728516, |
| "epoch": 0.18776137236493984, |
| "grad_norm": 7.916510105133057, |
| "kl": 3.9625, |
| "learning_rate": 2.9301866489845167e-06, |
| "loss": 0.3677, |
| "reward": 1.4658482789993286, |
| "reward_std": 0.7412176042795181, |
| "rewards/accuracy_reward": 0.16875000838190318, |
| "rewards/format_reward": 0.5830357387661934, |
| "rewards/tag_count_reward": 0.7140625357627869, |
| "step": 550 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 867.4428955078125, |
| "epoch": 0.18946829393189382, |
| "grad_norm": 4.7268781661987305, |
| "kl": 2.042578125, |
| "learning_rate": 2.9274660699262483e-06, |
| "loss": 0.2883, |
| "reward": 1.5462054193019867, |
| "reward_std": 0.7172608077526093, |
| "rewards/accuracy_reward": 0.14375000577419997, |
| "rewards/format_reward": 0.6491071701049804, |
| "rewards/tag_count_reward": 0.7533482521772384, |
| "step": 555 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 815.8750366210937, |
| "epoch": 0.19117521549884783, |
| "grad_norm": 9.568544387817383, |
| "kl": 2.749609375, |
| "learning_rate": 2.9246948018997622e-06, |
| "loss": 0.2904, |
| "reward": 1.508035770058632, |
| "reward_std": 0.6859041944146156, |
| "rewards/accuracy_reward": 0.1125000048428774, |
| "rewards/format_reward": 0.6437500223517418, |
| "rewards/tag_count_reward": 0.7517857521772384, |
| "step": 560 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 751.5598541259766, |
| "epoch": 0.19288213706580182, |
| "grad_norm": 8.419025421142578, |
| "kl": 3.47421875, |
| "learning_rate": 2.9218729433121034e-06, |
| "loss": 0.2747, |
| "reward": 1.25133935213089, |
| "reward_std": 0.7746896028518677, |
| "rewards/accuracy_reward": 0.11785714812576771, |
| "rewards/format_reward": 0.48660716563463213, |
| "rewards/tag_count_reward": 0.6468750298023224, |
| "step": 565 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.6839538574219, |
| "epoch": 0.19458905863275583, |
| "grad_norm": 6.536264896392822, |
| "kl": 2.1771484375, |
| "learning_rate": 2.9190005943667748e-06, |
| "loss": 0.2243, |
| "reward": 1.5011161386966705, |
| "reward_std": 0.7515750855207444, |
| "rewards/accuracy_reward": 0.15892858086153866, |
| "rewards/format_reward": 0.6035714507102966, |
| "rewards/tag_count_reward": 0.7386161029338837, |
| "step": 570 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 712.3911041259765, |
| "epoch": 0.1962959801997098, |
| "grad_norm": 9.216137886047363, |
| "kl": 1.8626953125, |
| "learning_rate": 2.9160778570601787e-06, |
| "loss": 0.1986, |
| "reward": 1.6468750715255738, |
| "reward_std": 0.5837471626698971, |
| "rewards/accuracy_reward": 0.11517857760190964, |
| "rewards/format_reward": 0.7107143133878708, |
| "rewards/tag_count_reward": 0.8209821790456772, |
| "step": 575 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 746.0116394042968, |
| "epoch": 0.19800290176666382, |
| "grad_norm": 3.9668009281158447, |
| "kl": 2.507421875, |
| "learning_rate": 2.9131048351779963e-06, |
| "loss": 0.2798, |
| "reward": 1.6100447118282317, |
| "reward_std": 0.6843759298324585, |
| "rewards/accuracy_reward": 0.15803572265431284, |
| "rewards/format_reward": 0.6633928894996644, |
| "rewards/tag_count_reward": 0.7886161029338836, |
| "step": 580 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.175927734375, |
| "epoch": 0.1997098233336178, |
| "grad_norm": 2.981647491455078, |
| "kl": 2.158203125, |
| "learning_rate": 2.9100816342915025e-06, |
| "loss": 0.2073, |
| "reward": 1.5310268461704255, |
| "reward_std": 0.688689549267292, |
| "rewards/accuracy_reward": 0.16428572265431285, |
| "rewards/format_reward": 0.6160714536905288, |
| "rewards/tag_count_reward": 0.750669676065445, |
| "step": 585 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 675.5616455078125, |
| "epoch": 0.20141674490057182, |
| "grad_norm": 2.74385142326355, |
| "kl": 2.028076171875, |
| "learning_rate": 2.907008361753815e-06, |
| "loss": 0.2021, |
| "reward": 1.7705357670783997, |
| "reward_std": 0.5894090965390205, |
| "rewards/accuracy_reward": 0.16964286444708704, |
| "rewards/format_reward": 0.7535714626312255, |
| "rewards/tag_count_reward": 0.8473214656114578, |
| "step": 590 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 691.3527038574218, |
| "epoch": 0.2031236664675258, |
| "grad_norm": 13.477487564086914, |
| "kl": 1.8919921875, |
| "learning_rate": 2.903885126696083e-06, |
| "loss": 0.2684, |
| "reward": 1.987723308801651, |
| "reward_std": 0.3995923690497875, |
| "rewards/accuracy_reward": 0.17142858142033218, |
| "rewards/format_reward": 0.8812500417232514, |
| "rewards/tag_count_reward": 0.9350446790456772, |
| "step": 595 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 785.2036071777344, |
| "epoch": 0.20483058803447982, |
| "grad_norm": 3.5577802658081055, |
| "kl": 2.7875, |
| "learning_rate": 2.900712040023615e-06, |
| "loss": 0.3984, |
| "reward": 1.8696429312229157, |
| "reward_std": 0.549374633282423, |
| "rewards/accuracy_reward": 0.15892857955768705, |
| "rewards/format_reward": 0.8214286148548127, |
| "rewards/tag_count_reward": 0.8892857551574707, |
| "step": 600 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.4580657958984, |
| "epoch": 0.20653750960143383, |
| "grad_norm": 5.842397689819336, |
| "kl": 3.459765625, |
| "learning_rate": 2.8974892144119353e-06, |
| "loss": 0.3857, |
| "reward": 1.702455425262451, |
| "reward_std": 0.6209064692258834, |
| "rewards/accuracy_reward": 0.16160715045407414, |
| "rewards/format_reward": 0.7205357551574707, |
| "rewards/tag_count_reward": 0.8203125447034836, |
| "step": 605 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 774.1259307861328, |
| "epoch": 0.2082444311683878, |
| "grad_norm": 4.560490131378174, |
| "kl": 2.733203125, |
| "learning_rate": 2.894216764302787e-06, |
| "loss": 0.3168, |
| "reward": 1.5366072058677673, |
| "reward_std": 0.7480582863092422, |
| "rewards/accuracy_reward": 0.14196429261937737, |
| "rewards/format_reward": 0.636607164144516, |
| "rewards/tag_count_reward": 0.7580357521772385, |
| "step": 610 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.8830657958985, |
| "epoch": 0.20995135273534182, |
| "grad_norm": 3.172254800796509, |
| "kl": 2.722265625, |
| "learning_rate": 2.8908948059000676e-06, |
| "loss": 0.2966, |
| "reward": 1.5986607730388642, |
| "reward_std": 0.6495244219899178, |
| "rewards/accuracy_reward": 0.13571429261937737, |
| "rewards/format_reward": 0.6714285999536515, |
| "rewards/tag_count_reward": 0.7915178924798966, |
| "step": 615 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.2607513427735, |
| "epoch": 0.2116582743022958, |
| "grad_norm": 3.3680763244628906, |
| "kl": 3.16171875, |
| "learning_rate": 2.8875234571656997e-06, |
| "loss": 0.3196, |
| "reward": 1.6500000834465027, |
| "reward_std": 0.7083895608782769, |
| "rewards/accuracy_reward": 0.15000000903382898, |
| "rewards/format_reward": 0.6964285969734192, |
| "rewards/tag_count_reward": 0.8035714656114579, |
| "step": 620 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.7464630126954, |
| "epoch": 0.21336519586924982, |
| "grad_norm": 7.421957492828369, |
| "kl": 3.5240234375, |
| "learning_rate": 2.8841028378154463e-06, |
| "loss": 0.3704, |
| "reward": 1.6843750715255736, |
| "reward_std": 0.7275039911270141, |
| "rewards/accuracy_reward": 0.16696429308503866, |
| "rewards/format_reward": 0.710714316368103, |
| "rewards/tag_count_reward": 0.8066964626312256, |
| "step": 625 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 751.6946838378906, |
| "epoch": 0.2150721174362038, |
| "grad_norm": 5.945168972015381, |
| "kl": 2.863671875, |
| "learning_rate": 2.8806330693146575e-06, |
| "loss": 0.3454, |
| "reward": 1.7339286506175995, |
| "reward_std": 0.6255421549081802, |
| "rewards/accuracy_reward": 0.1321428634226322, |
| "rewards/format_reward": 0.7598214596509933, |
| "rewards/tag_count_reward": 0.8419643193483353, |
| "step": 630 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 769.4634246826172, |
| "epoch": 0.21677903900315781, |
| "grad_norm": 2.3628225326538086, |
| "kl": 3.294921875, |
| "learning_rate": 2.877114274873957e-06, |
| "loss": 0.3297, |
| "reward": 1.7406250834465027, |
| "reward_std": 0.6107124865055085, |
| "rewards/accuracy_reward": 0.14464286426082254, |
| "rewards/format_reward": 0.764285746216774, |
| "rewards/tag_count_reward": 0.8316964715719223, |
| "step": 635 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 806.5482543945312, |
| "epoch": 0.2184859605701118, |
| "grad_norm": 5.536141395568848, |
| "kl": 2.88671875, |
| "learning_rate": 2.8735465794448674e-06, |
| "loss": 0.3702, |
| "reward": 1.673883992433548, |
| "reward_std": 0.6438215777277947, |
| "rewards/accuracy_reward": 0.12857143385335804, |
| "rewards/format_reward": 0.7267857402563095, |
| "rewards/tag_count_reward": 0.8185268223285675, |
| "step": 640 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 863.1018280029297, |
| "epoch": 0.2201928821370658, |
| "grad_norm": 7.033008575439453, |
| "kl": 3.584375, |
| "learning_rate": 2.869930109715375e-06, |
| "loss": 0.4444, |
| "reward": 1.6212054193019867, |
| "reward_std": 0.6893970921635628, |
| "rewards/accuracy_reward": 0.13660714784637093, |
| "rewards/format_reward": 0.6901785939931869, |
| "rewards/tag_count_reward": 0.794419676065445, |
| "step": 645 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 786.244677734375, |
| "epoch": 0.2218998037040198, |
| "grad_norm": 4.502243518829346, |
| "kl": 2.404296875, |
| "learning_rate": 2.8662649941054266e-06, |
| "loss": 0.2794, |
| "reward": 1.6738840103149415, |
| "reward_std": 0.6413164183497428, |
| "rewards/accuracy_reward": 0.12232143357396126, |
| "rewards/format_reward": 0.7303571730852128, |
| "rewards/tag_count_reward": 0.8212053954601288, |
| "step": 650 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 787.2545013427734, |
| "epoch": 0.2236067252709738, |
| "grad_norm": 6.634191989898682, |
| "kl": 3.10234375, |
| "learning_rate": 2.8625513627623757e-06, |
| "loss": 0.3178, |
| "reward": 1.6941964983940125, |
| "reward_std": 0.6722912862896919, |
| "rewards/accuracy_reward": 0.155357148591429, |
| "rewards/format_reward": 0.7267857551574707, |
| "rewards/tag_count_reward": 0.8120536059141159, |
| "step": 655 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 740.8027191162109, |
| "epoch": 0.2253136468379278, |
| "grad_norm": 5.780096530914307, |
| "kl": 2.25556640625, |
| "learning_rate": 2.8587893475563546e-06, |
| "loss": 0.2618, |
| "reward": 1.9029018759727478, |
| "reward_std": 0.548953752219677, |
| "rewards/accuracy_reward": 0.17589286472648383, |
| "rewards/format_reward": 0.8410714715719223, |
| "rewards/tag_count_reward": 0.8859375387430191, |
| "step": 660 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 804.132177734375, |
| "epoch": 0.2270205684048818, |
| "grad_norm": 5.417214870452881, |
| "kl": 2.9451171875, |
| "learning_rate": 2.854979082075596e-06, |
| "loss": 0.3144, |
| "reward": 1.9008929550647735, |
| "reward_std": 0.5650276392698288, |
| "rewards/accuracy_reward": 0.2026785809546709, |
| "rewards/format_reward": 0.8223214656114578, |
| "rewards/tag_count_reward": 0.8758929014205933, |
| "step": 665 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 783.5616394042969, |
| "epoch": 0.22872748997183578, |
| "grad_norm": 3.13038969039917, |
| "kl": 3.588671875, |
| "learning_rate": 2.851120701621688e-06, |
| "loss": 0.3702, |
| "reward": 1.8468750834465026, |
| "reward_std": 0.5954644531011581, |
| "rewards/accuracy_reward": 0.19642858095467092, |
| "rewards/format_reward": 0.7919643223285675, |
| "rewards/tag_count_reward": 0.8584821790456771, |
| "step": 670 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 778.3678833007813, |
| "epoch": 0.2304344115387898, |
| "grad_norm": 5.504825592041016, |
| "kl": 2.851171875, |
| "learning_rate": 2.8472143432047694e-06, |
| "loss": 0.3215, |
| "reward": 1.8622768580913545, |
| "reward_std": 0.5927805215120315, |
| "rewards/accuracy_reward": 0.19732143823057413, |
| "rewards/format_reward": 0.8017857521772385, |
| "rewards/tag_count_reward": 0.8631696820259094, |
| "step": 675 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 827.1464691162109, |
| "epoch": 0.23214133310574378, |
| "grad_norm": 4.780069351196289, |
| "kl": 3.583984375, |
| "learning_rate": 2.8432601455386644e-06, |
| "loss": 0.3892, |
| "reward": 1.737723284959793, |
| "reward_std": 0.6600617378950119, |
| "rewards/accuracy_reward": 0.17500000931322574, |
| "rewards/format_reward": 0.7455357432365417, |
| "rewards/tag_count_reward": 0.8171875357627869, |
| "step": 680 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 792.6098571777344, |
| "epoch": 0.2338482546726978, |
| "grad_norm": 3.67447829246521, |
| "kl": 3.701953125, |
| "learning_rate": 2.8392582490359563e-06, |
| "loss": 0.3747, |
| "reward": 1.7160715103149413, |
| "reward_std": 0.6576286390423774, |
| "rewards/accuracy_reward": 0.16964286621659994, |
| "rewards/format_reward": 0.7348214566707612, |
| "rewards/tag_count_reward": 0.811607176065445, |
| "step": 685 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 756.3125366210937, |
| "epoch": 0.23555517623965178, |
| "grad_norm": 7.4521660804748535, |
| "kl": 2.940234375, |
| "learning_rate": 2.8352087958030044e-06, |
| "loss": 0.2788, |
| "reward": 1.7551340162754059, |
| "reward_std": 0.6580756172537804, |
| "rewards/accuracy_reward": 0.16428572107106448, |
| "rewards/format_reward": 0.7553571790456772, |
| "rewards/tag_count_reward": 0.8354911088943482, |
| "step": 690 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 743.0946716308594, |
| "epoch": 0.2372620978066058, |
| "grad_norm": 3.591153144836426, |
| "kl": 3.9421875, |
| "learning_rate": 2.8311119296348947e-06, |
| "loss": 0.3788, |
| "reward": 1.6745536565780639, |
| "reward_std": 0.714211243391037, |
| "rewards/accuracy_reward": 0.19375000949949026, |
| "rewards/format_reward": 0.6946428894996644, |
| "rewards/tag_count_reward": 0.7861607521772385, |
| "step": 695 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 693.334848022461, |
| "epoch": 0.23896901937355977, |
| "grad_norm": 17.02330207824707, |
| "kl": 2.3927734375, |
| "learning_rate": 2.826967796010334e-06, |
| "loss": 0.2871, |
| "reward": 1.8256697237491608, |
| "reward_std": 0.600771751999855, |
| "rewards/accuracy_reward": 0.15803572218865156, |
| "rewards/format_reward": 0.8008928954601288, |
| "rewards/tag_count_reward": 0.8667411148548126, |
| "step": 700 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 669.8428833007813, |
| "epoch": 0.24067594094051378, |
| "grad_norm": 3.661524534225464, |
| "kl": 3.7298828125, |
| "learning_rate": 2.8227765420864864e-06, |
| "loss": 0.3348, |
| "reward": 1.8287947297096252, |
| "reward_std": 0.6175472036004066, |
| "rewards/accuracy_reward": 0.21517857927829026, |
| "rewards/format_reward": 0.7669643223285675, |
| "rewards/tag_count_reward": 0.846651828289032, |
| "step": 705 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 668.5580657958984, |
| "epoch": 0.2423828625074678, |
| "grad_norm": 3.7521235942840576, |
| "kl": 2.2900390625, |
| "learning_rate": 2.8185383166937453e-06, |
| "loss": 0.2425, |
| "reward": 1.7064732909202576, |
| "reward_std": 0.6850044190883636, |
| "rewards/accuracy_reward": 0.16071429289877415, |
| "rewards/format_reward": 0.7241071730852127, |
| "rewards/tag_count_reward": 0.8216518193483353, |
| "step": 710 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 669.4509185791015, |
| "epoch": 0.24408978407442178, |
| "grad_norm": 5.815065860748291, |
| "kl": 3.68515625, |
| "learning_rate": 2.8142532703304487e-06, |
| "loss": 0.2798, |
| "reward": 1.6671875596046448, |
| "reward_std": 0.6780211150646209, |
| "rewards/accuracy_reward": 0.15535714998841285, |
| "rewards/format_reward": 0.6991071730852128, |
| "rewards/tag_count_reward": 0.8127232491970062, |
| "step": 715 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 716.5839660644531, |
| "epoch": 0.2457967056413758, |
| "grad_norm": 2.345759153366089, |
| "kl": 2.4634765625, |
| "learning_rate": 2.8099215551575375e-06, |
| "loss": 0.2412, |
| "reward": 1.6359375834465026, |
| "reward_std": 0.695411990582943, |
| "rewards/accuracy_reward": 0.17767857862636446, |
| "rewards/format_reward": 0.6669643133878708, |
| "rewards/tag_count_reward": 0.7912946760654449, |
| "step": 720 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 697.8205673217774, |
| "epoch": 0.24750362720832977, |
| "grad_norm": 4.898887634277344, |
| "kl": 2.38125, |
| "learning_rate": 2.805543324993149e-06, |
| "loss": 0.2515, |
| "reward": 1.6654018700122832, |
| "reward_std": 0.6400819554924965, |
| "rewards/accuracy_reward": 0.1598214372061193, |
| "rewards/format_reward": 0.6964285969734192, |
| "rewards/tag_count_reward": 0.8091518193483352, |
| "step": 725 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.1000305175781, |
| "epoch": 0.24921054877528379, |
| "grad_norm": 2.1958072185516357, |
| "kl": 2.2142578125, |
| "learning_rate": 2.8011187353071575e-06, |
| "loss": 0.2594, |
| "reward": 1.758035796880722, |
| "reward_std": 0.6030809044837951, |
| "rewards/accuracy_reward": 0.12589286137372255, |
| "rewards/format_reward": 0.77232146859169, |
| "rewards/tag_count_reward": 0.8598214685916901, |
| "step": 730 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 717.7393188476562, |
| "epoch": 0.25091747034223777, |
| "grad_norm": 3.359809637069702, |
| "kl": 2.7140625, |
| "learning_rate": 2.796647943215651e-06, |
| "loss": 0.2854, |
| "reward": 1.7325893461704254, |
| "reward_std": 0.6349606230854988, |
| "rewards/accuracy_reward": 0.16785715138539672, |
| "rewards/format_reward": 0.7375000327825546, |
| "rewards/tag_count_reward": 0.8272321820259094, |
| "step": 735 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 721.5625427246093, |
| "epoch": 0.2526243919091918, |
| "grad_norm": 3.7433454990386963, |
| "kl": 3.691796875, |
| "learning_rate": 2.792131107475355e-06, |
| "loss": 0.3333, |
| "reward": 1.5823661267757416, |
| "reward_std": 0.7140142098069191, |
| "rewards/accuracy_reward": 0.1642857214435935, |
| "rewards/format_reward": 0.6526785969734192, |
| "rewards/tag_count_reward": 0.7654018223285675, |
| "step": 740 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 744.0803863525391, |
| "epoch": 0.2543313134761458, |
| "grad_norm": 3.059191942214966, |
| "kl": 2.079296875, |
| "learning_rate": 2.7875683884779937e-06, |
| "loss": 0.2113, |
| "reward": 1.699330449104309, |
| "reward_std": 0.6429368361830712, |
| "rewards/accuracy_reward": 0.1500000087544322, |
| "rewards/format_reward": 0.7285714596509933, |
| "rewards/tag_count_reward": 0.8207589685916901, |
| "step": 745 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 677.4527130126953, |
| "epoch": 0.25603823504309975, |
| "grad_norm": 4.240560054779053, |
| "kl": 1.996240234375, |
| "learning_rate": 2.782959948244593e-06, |
| "loss": 0.1608, |
| "reward": 1.8892858147621154, |
| "reward_std": 0.4875759735703468, |
| "rewards/accuracy_reward": 0.13214286137372255, |
| "rewards/format_reward": 0.8526786118745804, |
| "rewards/tag_count_reward": 0.9044643312692642, |
| "step": 750 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.899136352539, |
| "epoch": 0.25774515661005376, |
| "grad_norm": 4.286968231201172, |
| "kl": 3.210546875, |
| "learning_rate": 2.7783059504197293e-06, |
| "loss": 0.3291, |
| "reward": 1.8241072475910187, |
| "reward_std": 0.6449521824717521, |
| "rewards/accuracy_reward": 0.1866071516647935, |
| "rewards/format_reward": 0.7794643253087997, |
| "rewards/tag_count_reward": 0.8580357432365417, |
| "step": 755 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 718.2161071777343, |
| "epoch": 0.25945207817700777, |
| "grad_norm": 5.481039047241211, |
| "kl": 2.884765625, |
| "learning_rate": 2.7736065602657186e-06, |
| "loss": 0.3314, |
| "reward": 1.7383929252624513, |
| "reward_std": 0.6634438171982765, |
| "rewards/accuracy_reward": 0.15089286332949997, |
| "rewards/format_reward": 0.7526785999536514, |
| "rewards/tag_count_reward": 0.83482146859169, |
| "step": 760 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 741.3330688476562, |
| "epoch": 0.2611589997439618, |
| "grad_norm": 5.945084095001221, |
| "kl": 3.43203125, |
| "learning_rate": 2.7688619446567456e-06, |
| "loss": 0.4039, |
| "reward": 1.7281250834465027, |
| "reward_std": 0.6317564234137535, |
| "rewards/accuracy_reward": 0.11250000493600965, |
| "rewards/format_reward": 0.7660714745521545, |
| "rewards/tag_count_reward": 0.8495536029338837, |
| "step": 765 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 730.2044952392578, |
| "epoch": 0.26286592131091574, |
| "grad_norm": 9.397069931030273, |
| "kl": 3.7162109375, |
| "learning_rate": 2.7640722720729424e-06, |
| "loss": 0.3945, |
| "reward": 1.7779018819332122, |
| "reward_std": 0.6457269221544266, |
| "rewards/accuracy_reward": 0.15714286686852574, |
| "rewards/format_reward": 0.7714286088943482, |
| "rewards/tag_count_reward": 0.8493303894996643, |
| "step": 770 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 686.750032043457, |
| "epoch": 0.26457284287786975, |
| "grad_norm": 3.8738150596618652, |
| "kl": 2.05673828125, |
| "learning_rate": 2.7592377125944e-06, |
| "loss": 0.2139, |
| "reward": 1.8906250834465026, |
| "reward_std": 0.49385173320770265, |
| "rewards/accuracy_reward": 0.14910714831203223, |
| "rewards/format_reward": 0.8428571850061417, |
| "rewards/tag_count_reward": 0.8986607521772385, |
| "step": 775 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.9919921875, |
| "epoch": 0.26627976444482376, |
| "grad_norm": 5.740293502807617, |
| "kl": 2.2453125, |
| "learning_rate": 2.7543584378951353e-06, |
| "loss": 0.3081, |
| "reward": 1.9504465162754059, |
| "reward_std": 0.4741246700286865, |
| "rewards/accuracy_reward": 0.1589285794645548, |
| "rewards/format_reward": 0.8758929014205933, |
| "rewards/tag_count_reward": 0.9156250447034836, |
| "step": 780 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 744.438427734375, |
| "epoch": 0.2679866860117778, |
| "grad_norm": 5.1604743003845215, |
| "kl": 3.141796875, |
| "learning_rate": 2.7494346212369884e-06, |
| "loss": 0.3417, |
| "reward": 1.8397322237491607, |
| "reward_std": 0.5357820302248001, |
| "rewards/accuracy_reward": 0.1508928632363677, |
| "rewards/format_reward": 0.8133928954601288, |
| "rewards/tag_count_reward": 0.8754464715719223, |
| "step": 785 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.9339630126954, |
| "epoch": 0.26969360757873173, |
| "grad_norm": 2.745206832885742, |
| "kl": 3.4203125, |
| "learning_rate": 2.7444664374634755e-06, |
| "loss": 0.3742, |
| "reward": 1.7848215222358703, |
| "reward_std": 0.6119966760277749, |
| "rewards/accuracy_reward": 0.12321429047733545, |
| "rewards/format_reward": 0.8026786148548126, |
| "rewards/tag_count_reward": 0.8589286118745804, |
| "step": 790 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.5946807861328, |
| "epoch": 0.27140052914568574, |
| "grad_norm": 5.523811340332031, |
| "kl": 3.22548828125, |
| "learning_rate": 2.739454062993578e-06, |
| "loss": 0.3161, |
| "reward": 1.862946516275406, |
| "reward_std": 0.5260402396321296, |
| "rewards/accuracy_reward": 0.16607143506407737, |
| "rewards/format_reward": 0.823214328289032, |
| "rewards/tag_count_reward": 0.8736607551574707, |
| "step": 795 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.8053894042969, |
| "epoch": 0.27310745071263975, |
| "grad_norm": 4.337275981903076, |
| "kl": 1.62451171875, |
| "learning_rate": 2.7343976758154765e-06, |
| "loss": 0.2197, |
| "reward": 1.9397322237491608, |
| "reward_std": 0.47983556240797043, |
| "rewards/accuracy_reward": 0.169642863702029, |
| "rewards/format_reward": 0.86607146859169, |
| "rewards/tag_count_reward": 0.9040178984403611, |
| "step": 800 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 729.189321899414, |
| "epoch": 0.27481437227959377, |
| "grad_norm": 6.721214294433594, |
| "kl": 3.484375, |
| "learning_rate": 2.7292974554802343e-06, |
| "loss": 0.3518, |
| "reward": 1.8767857909202577, |
| "reward_std": 0.5359082013368607, |
| "rewards/accuracy_reward": 0.16875000894069672, |
| "rewards/format_reward": 0.8303571820259095, |
| "rewards/tag_count_reward": 0.8776786088943481, |
| "step": 805 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 764.805386352539, |
| "epoch": 0.2765212938465478, |
| "grad_norm": 4.701261520385742, |
| "kl": 2.3328125, |
| "learning_rate": 2.7241535830954174e-06, |
| "loss": 0.2629, |
| "reward": 1.816517949104309, |
| "reward_std": 0.500702029466629, |
| "rewards/accuracy_reward": 0.1241071479395032, |
| "rewards/format_reward": 0.8241071820259094, |
| "rewards/tag_count_reward": 0.8683036148548127, |
| "step": 810 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 714.7661026000976, |
| "epoch": 0.27822821541350173, |
| "grad_norm": 13.666290283203125, |
| "kl": 2.6896484375, |
| "learning_rate": 2.718966241318666e-06, |
| "loss": 0.3039, |
| "reward": 1.8928572177886962, |
| "reward_std": 0.551244530826807, |
| "rewards/accuracy_reward": 0.17857143664732575, |
| "rewards/format_reward": 0.835714328289032, |
| "rewards/tag_count_reward": 0.8785714715719223, |
| "step": 815 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 743.2277130126953, |
| "epoch": 0.27993513698045575, |
| "grad_norm": 8.007411003112793, |
| "kl": 2.865234375, |
| "learning_rate": 2.713735614351208e-06, |
| "loss": 0.2725, |
| "reward": 1.8796875894069671, |
| "reward_std": 0.5558550491929054, |
| "rewards/accuracy_reward": 0.17857143580913543, |
| "rewards/format_reward": 0.8267857521772385, |
| "rewards/tag_count_reward": 0.874330398440361, |
| "step": 820 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 822.1491485595703, |
| "epoch": 0.28164205854740976, |
| "grad_norm": 5.34137487411499, |
| "kl": 3.694140625, |
| "learning_rate": 2.7084618879313177e-06, |
| "loss": 0.4349, |
| "reward": 1.6993304371833802, |
| "reward_std": 0.7172507822513581, |
| "rewards/accuracy_reward": 0.12500000596046448, |
| "rewards/format_reward": 0.7419643193483353, |
| "rewards/tag_count_reward": 0.8323661059141159, |
| "step": 825 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 812.3500396728516, |
| "epoch": 0.28334898011436377, |
| "grad_norm": 2.870173454284668, |
| "kl": 3.059375, |
| "learning_rate": 2.7031452493277193e-06, |
| "loss": 0.3536, |
| "reward": 1.5551339983940125, |
| "reward_std": 0.7007823586463928, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/format_reward": 0.6125000178813934, |
| "rewards/tag_count_reward": 0.8176339656114578, |
| "step": 830 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 774.5098571777344, |
| "epoch": 0.2850559016813177, |
| "grad_norm": 3.5636160373687744, |
| "kl": 2.5625, |
| "learning_rate": 2.6977858873329394e-06, |
| "loss": 0.2921, |
| "reward": 1.5524554193019866, |
| "reward_std": 0.6861270070075989, |
| "rewards/accuracy_reward": 0.16696429420262576, |
| "rewards/format_reward": 0.5660714566707611, |
| "rewards/tag_count_reward": 0.819419676065445, |
| "step": 835 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 755.8803894042969, |
| "epoch": 0.28676282324827174, |
| "grad_norm": 7.776058197021484, |
| "kl": 2.5333984375, |
| "learning_rate": 2.6923839922566012e-06, |
| "loss": 0.3192, |
| "reward": 1.6341518700122832, |
| "reward_std": 0.6832039266824722, |
| "rewards/accuracy_reward": 0.17410714970901608, |
| "rewards/format_reward": 0.6196428865194321, |
| "rewards/tag_count_reward": 0.8404018163681031, |
| "step": 840 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 714.9723510742188, |
| "epoch": 0.28846974481522575, |
| "grad_norm": 5.4556498527526855, |
| "kl": 3.023828125, |
| "learning_rate": 2.686939755918667e-06, |
| "loss": 0.3092, |
| "reward": 1.707589364051819, |
| "reward_std": 0.6695069923996926, |
| "rewards/accuracy_reward": 0.1642857222817838, |
| "rewards/format_reward": 0.700892886519432, |
| "rewards/tag_count_reward": 0.842410746216774, |
| "step": 845 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 713.3223480224609, |
| "epoch": 0.29017666638217976, |
| "grad_norm": 4.508235454559326, |
| "kl": 3.01953125, |
| "learning_rate": 2.6814533716426266e-06, |
| "loss": 0.3284, |
| "reward": 1.711160808801651, |
| "reward_std": 0.6680841892957687, |
| "rewards/accuracy_reward": 0.14910715091973542, |
| "rewards/format_reward": 0.7205357491970062, |
| "rewards/tag_count_reward": 0.8415178894996643, |
| "step": 850 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 693.2723495483399, |
| "epoch": 0.2918835879491337, |
| "grad_norm": 5.78735876083374, |
| "kl": 2.384765625, |
| "learning_rate": 2.675925034248633e-06, |
| "loss": 0.2759, |
| "reward": 1.7803572475910188, |
| "reward_std": 0.5899964898824692, |
| "rewards/accuracy_reward": 0.11339286342263222, |
| "rewards/format_reward": 0.7901786118745804, |
| "rewards/tag_count_reward": 0.8767857521772384, |
| "step": 855 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 703.7946746826171, |
| "epoch": 0.2935905095160877, |
| "grad_norm": 10.656431198120117, |
| "kl": 2.68349609375, |
| "learning_rate": 2.670354940046585e-06, |
| "loss": 0.2471, |
| "reward": 1.8513393819332122, |
| "reward_std": 0.5069714426994324, |
| "rewards/accuracy_reward": 0.15625000707805156, |
| "rewards/format_reward": 0.8116071820259094, |
| "rewards/tag_count_reward": 0.8834821820259094, |
| "step": 860 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 701.3857421875, |
| "epoch": 0.29529743108304174, |
| "grad_norm": 5.979944229125977, |
| "kl": 1.7492431640625, |
| "learning_rate": 2.664743286829154e-06, |
| "loss": 0.1888, |
| "reward": 1.957812601327896, |
| "reward_std": 0.4485042683780193, |
| "rewards/accuracy_reward": 0.16785714952275158, |
| "rewards/format_reward": 0.8714286148548126, |
| "rewards/tag_count_reward": 0.918526828289032, |
| "step": 865 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.1571807861328, |
| "epoch": 0.29700435264999575, |
| "grad_norm": 11.468668937683105, |
| "kl": 2.44384765625, |
| "learning_rate": 2.6590902738647616e-06, |
| "loss": 0.2573, |
| "reward": 1.9354911506175996, |
| "reward_std": 0.4262035805732012, |
| "rewards/accuracy_reward": 0.14375000605359672, |
| "rewards/format_reward": 0.8741071820259094, |
| "rewards/tag_count_reward": 0.9176339715719223, |
| "step": 870 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 751.6589630126953, |
| "epoch": 0.2987112742169497, |
| "grad_norm": 2.175903081893921, |
| "kl": 1.407275390625, |
| "learning_rate": 2.6533961018905052e-06, |
| "loss": 0.1315, |
| "reward": 1.9747768819332123, |
| "reward_std": 0.41898268088698387, |
| "rewards/accuracy_reward": 0.17946429271250963, |
| "rewards/format_reward": 0.8776786088943481, |
| "rewards/tag_count_reward": 0.9176339745521546, |
| "step": 875 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 717.8036071777344, |
| "epoch": 0.3004181957839037, |
| "grad_norm": 14.61109733581543, |
| "kl": 1.8591796875, |
| "learning_rate": 2.6476609731050277e-06, |
| "loss": 0.18, |
| "reward": 1.9504465281963348, |
| "reward_std": 0.44176030084490775, |
| "rewards/accuracy_reward": 0.17410715091973544, |
| "rewards/format_reward": 0.8669643223285675, |
| "rewards/tag_count_reward": 0.9093750417232513, |
| "step": 880 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.5634307861328, |
| "epoch": 0.30212511735085773, |
| "grad_norm": 9.741268157958984, |
| "kl": 2.7447265625, |
| "learning_rate": 2.6418850911613385e-06, |
| "loss": 0.1958, |
| "reward": 1.9113840103149413, |
| "reward_std": 0.5450401276350021, |
| "rewards/accuracy_reward": 0.19375000856816768, |
| "rewards/format_reward": 0.8312500447034836, |
| "rewards/tag_count_reward": 0.8863839685916901, |
| "step": 885 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 695.589321899414, |
| "epoch": 0.30383203891781174, |
| "grad_norm": 3.411552667617798, |
| "kl": 1.9880859375, |
| "learning_rate": 2.6360686611595808e-06, |
| "loss": 0.1606, |
| "reward": 1.933928668498993, |
| "reward_std": 0.5159840732812881, |
| "rewards/accuracy_reward": 0.18303572619333863, |
| "rewards/format_reward": 0.8535714715719223, |
| "rewards/tag_count_reward": 0.8973214626312256, |
| "step": 890 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 700.4536041259765, |
| "epoch": 0.3055389604847657, |
| "grad_norm": 2.229177474975586, |
| "kl": 1.51279296875, |
| "learning_rate": 2.63021188963975e-06, |
| "loss": 0.1022, |
| "reward": 1.9627233028411866, |
| "reward_std": 0.5419594079256058, |
| "rewards/accuracy_reward": 0.21071429420262575, |
| "rewards/format_reward": 0.8491071790456772, |
| "rewards/tag_count_reward": 0.902901828289032, |
| "step": 895 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.6919952392578, |
| "epoch": 0.3072458820517197, |
| "grad_norm": 5.474370956420898, |
| "kl": 2.558203125, |
| "learning_rate": 2.62431498457436e-06, |
| "loss": 0.2085, |
| "reward": 1.7459822177886963, |
| "reward_std": 0.5931232050061226, |
| "rewards/accuracy_reward": 0.11696429029107094, |
| "rewards/format_reward": 0.7803571790456771, |
| "rewards/tag_count_reward": 0.848660746216774, |
| "step": 900 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.7214614868165, |
| "epoch": 0.3089528036186737, |
| "grad_norm": 3.358218193054199, |
| "kl": 1.9822265625, |
| "learning_rate": 2.6183781553610553e-06, |
| "loss": 0.1622, |
| "reward": 1.9395090162754058, |
| "reward_std": 0.45833816528320315, |
| "rewards/accuracy_reward": 0.1821428650058806, |
| "rewards/format_reward": 0.8562500417232514, |
| "rewards/tag_count_reward": 0.9011161118745804, |
| "step": 905 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 698.5411071777344, |
| "epoch": 0.31065972518562773, |
| "grad_norm": 3.1719696521759033, |
| "kl": 1.41435546875, |
| "learning_rate": 2.612401612815176e-06, |
| "loss": 0.1284, |
| "reward": 2.0187501192092894, |
| "reward_std": 0.4606904126703739, |
| "rewards/accuracy_reward": 0.20892858002334833, |
| "rewards/format_reward": 0.8901786148548126, |
| "rewards/tag_count_reward": 0.9196429014205932, |
| "step": 910 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 702.0857452392578, |
| "epoch": 0.31236664675258174, |
| "grad_norm": 4.088690757751465, |
| "kl": 2.1846923828125, |
| "learning_rate": 2.6063855691622773e-06, |
| "loss": 0.2211, |
| "reward": 1.9000000834465027, |
| "reward_std": 0.4846745885908604, |
| "rewards/accuracy_reward": 0.134821433480829, |
| "rewards/format_reward": 0.8642857521772385, |
| "rewards/tag_count_reward": 0.900892898440361, |
| "step": 915 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 721.0803833007812, |
| "epoch": 0.3140735683195357, |
| "grad_norm": 4.751431465148926, |
| "kl": 2.767578125, |
| "learning_rate": 2.6003302380305835e-06, |
| "loss": 0.2363, |
| "reward": 1.8395090222358703, |
| "reward_std": 0.6193484604358673, |
| "rewards/accuracy_reward": 0.18660715268924832, |
| "rewards/format_reward": 0.7964286088943482, |
| "rewards/tag_count_reward": 0.8564732551574707, |
| "step": 920 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 763.2937896728515, |
| "epoch": 0.3157804898864897, |
| "grad_norm": 13.211581230163574, |
| "kl": 2.8927734375, |
| "learning_rate": 2.5942358344434123e-06, |
| "loss": 0.2895, |
| "reward": 1.7747768580913543, |
| "reward_std": 0.6452827632427216, |
| "rewards/accuracy_reward": 0.16428572237491607, |
| "rewards/format_reward": 0.7723214626312256, |
| "rewards/tag_count_reward": 0.8381696790456772, |
| "step": 925 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 788.6928924560547, |
| "epoch": 0.3174874114534437, |
| "grad_norm": 10.8933687210083, |
| "kl": 3.546875, |
| "learning_rate": 2.588102574811531e-06, |
| "loss": 0.3418, |
| "reward": 1.7863839983940124, |
| "reward_std": 0.6044072821736336, |
| "rewards/accuracy_reward": 0.1562500067986548, |
| "rewards/format_reward": 0.7821428894996643, |
| "rewards/tag_count_reward": 0.8479911088943481, |
| "step": 930 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 764.8339691162109, |
| "epoch": 0.31919433302039774, |
| "grad_norm": 6.15324068069458, |
| "kl": 3.3, |
| "learning_rate": 2.581930676925478e-06, |
| "loss": 0.3588, |
| "reward": 1.8930804491043092, |
| "reward_std": 0.5663298577070236, |
| "rewards/accuracy_reward": 0.20714286640286445, |
| "rewards/format_reward": 0.8142857611179352, |
| "rewards/tag_count_reward": 0.8716518223285675, |
| "step": 935 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 747.1009246826172, |
| "epoch": 0.3209012545873517, |
| "grad_norm": 6.0142927169799805, |
| "kl": 1.954833984375, |
| "learning_rate": 2.5757203599478252e-06, |
| "loss": 0.2146, |
| "reward": 1.9209822177886964, |
| "reward_std": 0.4752822183072567, |
| "rewards/accuracy_reward": 0.1660714359022677, |
| "rewards/format_reward": 0.8562500447034835, |
| "rewards/tag_count_reward": 0.8986607551574707, |
| "step": 940 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 794.6161071777344, |
| "epoch": 0.3226081761543057, |
| "grad_norm": 4.774122714996338, |
| "kl": 2.436328125, |
| "learning_rate": 2.5694718444053977e-06, |
| "loss": 0.281, |
| "reward": 1.9752233147621154, |
| "reward_std": 0.4990184798836708, |
| "rewards/accuracy_reward": 0.18660715082660317, |
| "rewards/format_reward": 0.873214328289032, |
| "rewards/tag_count_reward": 0.9154018312692642, |
| "step": 945 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 798.3562744140625, |
| "epoch": 0.3243150977212597, |
| "grad_norm": 8.109549522399902, |
| "kl": 2.608984375, |
| "learning_rate": 2.5631853521814413e-06, |
| "loss": 0.3287, |
| "reward": 1.9203125774860381, |
| "reward_std": 0.5808916047215462, |
| "rewards/accuracy_reward": 0.2053571494296193, |
| "rewards/format_reward": 0.8321428924798966, |
| "rewards/tag_count_reward": 0.8828125417232513, |
| "step": 950 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 786.5107513427735, |
| "epoch": 0.3260220192882137, |
| "grad_norm": 10.114983558654785, |
| "kl": 3.005859375, |
| "learning_rate": 2.556861106507745e-06, |
| "loss": 0.3483, |
| "reward": 1.8308036565780639, |
| "reward_std": 0.598972900211811, |
| "rewards/accuracy_reward": 0.16696429401636123, |
| "rewards/format_reward": 0.8000000387430191, |
| "rewards/tag_count_reward": 0.8638393193483352, |
| "step": 955 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 804.1580688476563, |
| "epoch": 0.3277289408551677, |
| "grad_norm": 6.412922382354736, |
| "kl": 3.478125, |
| "learning_rate": 2.5504993319567154e-06, |
| "loss": 0.3655, |
| "reward": 1.837276864051819, |
| "reward_std": 0.5885704472661019, |
| "rewards/accuracy_reward": 0.1660714378580451, |
| "rewards/format_reward": 0.8062500447034836, |
| "rewards/tag_count_reward": 0.8649553924798965, |
| "step": 960 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 773.3937805175781, |
| "epoch": 0.3294358624221217, |
| "grad_norm": 8.863078117370605, |
| "kl": 2.7748046875, |
| "learning_rate": 2.544100254433396e-06, |
| "loss": 0.3388, |
| "reward": 1.8790179371833802, |
| "reward_std": 0.5164345070719719, |
| "rewards/accuracy_reward": 0.1598214370198548, |
| "rewards/format_reward": 0.836607176065445, |
| "rewards/tag_count_reward": 0.8825893253087997, |
| "step": 965 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 789.2919982910156, |
| "epoch": 0.3311427839890757, |
| "grad_norm": 10.961185455322266, |
| "kl": 3.2140625, |
| "learning_rate": 2.537664101167453e-06, |
| "loss": 0.3765, |
| "reward": 1.897991156578064, |
| "reward_std": 0.5266193248331547, |
| "rewards/accuracy_reward": 0.1919642908498645, |
| "rewards/format_reward": 0.8276786118745804, |
| "rewards/tag_count_reward": 0.8783482581377029, |
| "step": 970 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 822.3991485595703, |
| "epoch": 0.3328497055560297, |
| "grad_norm": 3.4954206943511963, |
| "kl": 3.3, |
| "learning_rate": 2.531191100705102e-06, |
| "loss": 0.3987, |
| "reward": 1.801116168498993, |
| "reward_std": 0.596581481397152, |
| "rewards/accuracy_reward": 0.15803572116419673, |
| "rewards/format_reward": 0.795535746216774, |
| "rewards/tag_count_reward": 0.8475446820259094, |
| "step": 975 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 819.3446838378907, |
| "epoch": 0.3345566271229837, |
| "grad_norm": 4.400045871734619, |
| "kl": 4.9328125, |
| "learning_rate": 2.5246814829009937e-06, |
| "loss": 0.493, |
| "reward": 1.7078125894069671, |
| "reward_std": 0.6570774331688881, |
| "rewards/accuracy_reward": 0.14910714896395802, |
| "rewards/format_reward": 0.7455357491970063, |
| "rewards/tag_count_reward": 0.8131696790456772, |
| "step": 980 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 791.8509246826172, |
| "epoch": 0.3362635486899377, |
| "grad_norm": 5.740270137786865, |
| "kl": 2.66484375, |
| "learning_rate": 2.518135478910051e-06, |
| "loss": 0.3217, |
| "reward": 1.900223296880722, |
| "reward_std": 0.5493961855769157, |
| "rewards/accuracy_reward": 0.18928572423756124, |
| "rewards/format_reward": 0.8312500417232513, |
| "rewards/tag_count_reward": 0.8796875387430191, |
| "step": 985 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 797.6411010742188, |
| "epoch": 0.3379704702568917, |
| "grad_norm": 7.0871076583862305, |
| "kl": 2.4041015625, |
| "learning_rate": 2.5115533211792624e-06, |
| "loss": 0.2537, |
| "reward": 1.90089293718338, |
| "reward_std": 0.4941477760672569, |
| "rewards/accuracy_reward": 0.14821429271250963, |
| "rewards/format_reward": 0.8571429044008255, |
| "rewards/tag_count_reward": 0.8955357670783997, |
| "step": 990 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 750.6303894042969, |
| "epoch": 0.3396773918238457, |
| "grad_norm": 2.675424575805664, |
| "kl": 2.11796875, |
| "learning_rate": 2.5049352434394263e-06, |
| "loss": 0.2056, |
| "reward": 2.0265625774860383, |
| "reward_std": 0.39688876681029794, |
| "rewards/accuracy_reward": 0.18392857871949672, |
| "rewards/format_reward": 0.9044643312692642, |
| "rewards/tag_count_reward": 0.9381696909666062, |
| "step": 995 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 783.2669982910156, |
| "epoch": 0.34138431339079967, |
| "grad_norm": 5.672367572784424, |
| "kl": 1.9142578125, |
| "learning_rate": 2.4982814806968506e-06, |
| "loss": 0.2906, |
| "reward": 1.9834822058677672, |
| "reward_std": 0.4078844651579857, |
| "rewards/accuracy_reward": 0.15892857778817415, |
| "rewards/format_reward": 0.8955357551574707, |
| "rewards/tag_count_reward": 0.9290179044008255, |
| "step": 1000 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 814.5116424560547, |
| "epoch": 0.3430912349577537, |
| "grad_norm": 4.191405773162842, |
| "kl": 2.8767578125, |
| "learning_rate": 2.4915922692250107e-06, |
| "loss": 0.3323, |
| "reward": 1.9546875834465027, |
| "reward_std": 0.512929305434227, |
| "rewards/accuracy_reward": 0.19910715222358705, |
| "rewards/format_reward": 0.8580357611179352, |
| "rewards/tag_count_reward": 0.8975446790456771, |
| "step": 1005 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 802.0875366210937, |
| "epoch": 0.3447981565247077, |
| "grad_norm": 10.191553115844727, |
| "kl": 2.809375, |
| "learning_rate": 2.484867846556157e-06, |
| "loss": 0.3604, |
| "reward": 1.8754464983940125, |
| "reward_std": 0.5539876684546471, |
| "rewards/accuracy_reward": 0.16250000847503543, |
| "rewards/format_reward": 0.8330357551574707, |
| "rewards/tag_count_reward": 0.8799107581377029, |
| "step": 1010 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.6366394042968, |
| "epoch": 0.3465050780916617, |
| "grad_norm": 9.369229316711426, |
| "kl": 2.7625, |
| "learning_rate": 2.4781084514728797e-06, |
| "loss": 0.3838, |
| "reward": 1.8613839983940124, |
| "reward_std": 0.5541360631585122, |
| "rewards/accuracy_reward": 0.17589286714792252, |
| "rewards/format_reward": 0.8142857551574707, |
| "rewards/tag_count_reward": 0.871205398440361, |
| "step": 1015 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 756.9161071777344, |
| "epoch": 0.3482119996586157, |
| "grad_norm": 49.239768981933594, |
| "kl": 2.9908203125, |
| "learning_rate": 2.471314323999632e-06, |
| "loss": 0.3966, |
| "reward": 1.8754465281963348, |
| "reward_std": 0.548448670655489, |
| "rewards/accuracy_reward": 0.17589286426082254, |
| "rewards/format_reward": 0.8223214656114578, |
| "rewards/tag_count_reward": 0.8772321790456772, |
| "step": 1020 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 724.8803802490235, |
| "epoch": 0.34991892122556967, |
| "grad_norm": 8.442652702331543, |
| "kl": 2.96015625, |
| "learning_rate": 2.4644857053942066e-06, |
| "loss": 0.3474, |
| "reward": 1.855803668498993, |
| "reward_std": 0.5516223564743996, |
| "rewards/accuracy_reward": 0.16160714980214835, |
| "rewards/format_reward": 0.8151786029338837, |
| "rewards/tag_count_reward": 0.8790178924798966, |
| "step": 1025 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 755.0678955078125, |
| "epoch": 0.3516258427925237, |
| "grad_norm": 6.77180814743042, |
| "kl": 2.8849609375, |
| "learning_rate": 2.457622838139166e-06, |
| "loss": 0.4207, |
| "reward": 1.8727679431438446, |
| "reward_std": 0.6202140808105469, |
| "rewards/accuracy_reward": 0.18928572423756124, |
| "rewards/format_reward": 0.8116071820259094, |
| "rewards/tag_count_reward": 0.8718750387430191, |
| "step": 1030 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 849.719677734375, |
| "epoch": 0.3533327643594777, |
| "grad_norm": 10.987722396850586, |
| "kl": 3.19482421875, |
| "learning_rate": 2.4507259659332335e-06, |
| "loss": 0.3413, |
| "reward": 1.7948661506175996, |
| "reward_std": 0.6245307192206383, |
| "rewards/accuracy_reward": 0.17410715082660316, |
| "rewards/format_reward": 0.7767857521772384, |
| "rewards/tag_count_reward": 0.8439732521772385, |
| "step": 1035 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 895.3580810546875, |
| "epoch": 0.3550396859264317, |
| "grad_norm": 6.328294277191162, |
| "kl": 3.1818359375, |
| "learning_rate": 2.443795333682642e-06, |
| "loss": 0.3346, |
| "reward": 1.7332590043544769, |
| "reward_std": 0.5950308412313461, |
| "rewards/accuracy_reward": 0.11160714598372579, |
| "rewards/format_reward": 0.7758929014205933, |
| "rewards/tag_count_reward": 0.8457589685916901, |
| "step": 1040 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 793.3732391357422, |
| "epoch": 0.35674660749338566, |
| "grad_norm": 4.254904270172119, |
| "kl": 2.4732421875, |
| "learning_rate": 2.4368311874924335e-06, |
| "loss": 0.2072, |
| "reward": 1.8361608028411864, |
| "reward_std": 0.5906515352427959, |
| "rewards/accuracy_reward": 0.2044642929919064, |
| "rewards/format_reward": 0.7812500387430191, |
| "rewards/tag_count_reward": 0.8504464656114579, |
| "step": 1045 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 773.1652160644531, |
| "epoch": 0.35845352906033967, |
| "grad_norm": 5.058516979217529, |
| "kl": 1.618115234375, |
| "learning_rate": 2.4298337746577227e-06, |
| "loss": 0.1907, |
| "reward": 1.948660784959793, |
| "reward_std": 0.4876935049891472, |
| "rewards/accuracy_reward": 0.1875000072643161, |
| "rewards/format_reward": 0.8535714715719223, |
| "rewards/tag_count_reward": 0.9075893253087998, |
| "step": 1050 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 853.6509307861328, |
| "epoch": 0.3601604506272937, |
| "grad_norm": 7.05323600769043, |
| "kl": 3.0990234375, |
| "learning_rate": 2.4228033436549135e-06, |
| "loss": 0.3536, |
| "reward": 1.8406250953674317, |
| "reward_std": 0.5797086969017983, |
| "rewards/accuracy_reward": 0.1598214370198548, |
| "rewards/format_reward": 0.8080357611179352, |
| "rewards/tag_count_reward": 0.8727678984403611, |
| "step": 1055 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 881.2268157958985, |
| "epoch": 0.3618673721942477, |
| "grad_norm": 7.7438883781433105, |
| "kl": 2.562109375, |
| "learning_rate": 2.4157401441328782e-06, |
| "loss": 0.311, |
| "reward": 1.8348215222358704, |
| "reward_std": 0.5855709910392761, |
| "rewards/accuracy_reward": 0.16339286286383867, |
| "rewards/format_reward": 0.80357146859169, |
| "rewards/tag_count_reward": 0.8678571909666062, |
| "step": 1060 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 815.9911071777344, |
| "epoch": 0.36357429376120165, |
| "grad_norm": 2.9513394832611084, |
| "kl": 3.583203125, |
| "learning_rate": 2.4086444269040905e-06, |
| "loss": 0.3823, |
| "reward": 1.8600447118282317, |
| "reward_std": 0.560142383724451, |
| "rewards/accuracy_reward": 0.16071429243311286, |
| "rewards/format_reward": 0.8205357521772385, |
| "rewards/tag_count_reward": 0.8787946820259094, |
| "step": 1065 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 861.2714660644531, |
| "epoch": 0.36528121532815566, |
| "grad_norm": 5.095993995666504, |
| "kl": 3.023828125, |
| "learning_rate": 2.4015164439357192e-06, |
| "loss": 0.2902, |
| "reward": 1.8332590162754059, |
| "reward_std": 0.6293601065874099, |
| "rewards/accuracy_reward": 0.16785715147852898, |
| "rewards/format_reward": 0.8000000417232513, |
| "rewards/tag_count_reward": 0.8654018253087997, |
| "step": 1070 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 789.6696807861329, |
| "epoch": 0.3669881368951097, |
| "grad_norm": 3.97644305229187, |
| "kl": 2.890625, |
| "learning_rate": 2.3943564483406825e-06, |
| "loss": 0.3023, |
| "reward": 1.897991156578064, |
| "reward_std": 0.5991736590862274, |
| "rewards/accuracy_reward": 0.20000000931322576, |
| "rewards/format_reward": 0.8187500417232514, |
| "rewards/tag_count_reward": 0.8792411088943481, |
| "step": 1075 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 780.6500396728516, |
| "epoch": 0.3686950584620637, |
| "grad_norm": 2.795880079269409, |
| "kl": 2.1537109375, |
| "learning_rate": 2.387164694368659e-06, |
| "loss": 0.2472, |
| "reward": 1.9285715222358704, |
| "reward_std": 0.5227874010801316, |
| "rewards/accuracy_reward": 0.1892857223749161, |
| "rewards/format_reward": 0.8401786148548126, |
| "rewards/tag_count_reward": 0.8991071790456772, |
| "step": 1080 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 799.2286102294922, |
| "epoch": 0.37040198002901764, |
| "grad_norm": 5.091084003448486, |
| "kl": 4.1421875, |
| "learning_rate": 2.3799414373970595e-06, |
| "loss": 0.3939, |
| "reward": 1.7937500655651093, |
| "reward_std": 0.5763512536883354, |
| "rewards/accuracy_reward": 0.1687500085681677, |
| "rewards/format_reward": 0.7803571790456771, |
| "rewards/tag_count_reward": 0.8446428954601288, |
| "step": 1085 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 763.5044982910156, |
| "epoch": 0.37210890159597165, |
| "grad_norm": 4.417270183563232, |
| "kl": 2.73828125, |
| "learning_rate": 2.372686933921957e-06, |
| "loss": 0.2796, |
| "reward": 1.7484375894069673, |
| "reward_std": 0.6087081640958786, |
| "rewards/accuracy_reward": 0.12053572125732899, |
| "rewards/format_reward": 0.7758928954601287, |
| "rewards/tag_count_reward": 0.8520089656114578, |
| "step": 1090 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 762.025033569336, |
| "epoch": 0.37381582316292566, |
| "grad_norm": 3.8321306705474854, |
| "kl": 3.471875, |
| "learning_rate": 2.3654014415489823e-06, |
| "loss": 0.3855, |
| "reward": 1.7381697118282318, |
| "reward_std": 0.6132876291871071, |
| "rewards/accuracy_reward": 0.137500006146729, |
| "rewards/format_reward": 0.7633928924798965, |
| "rewards/tag_count_reward": 0.8372768223285675, |
| "step": 1095 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.9294982910156, |
| "epoch": 0.3755227447298797, |
| "grad_norm": 6.6013569831848145, |
| "kl": 2.5224609375, |
| "learning_rate": 2.3580852189841734e-06, |
| "loss": 0.2485, |
| "reward": 1.7801340043544769, |
| "reward_std": 0.5882782399654388, |
| "rewards/accuracy_reward": 0.11785714775323868, |
| "rewards/format_reward": 0.7991071850061416, |
| "rewards/tag_count_reward": 0.8631696850061417, |
| "step": 1100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 722.4464599609375, |
| "epoch": 0.3772296662968337, |
| "grad_norm": 10.079838752746582, |
| "kl": 2.2388671875, |
| "learning_rate": 2.35073852602479e-06, |
| "loss": 0.2362, |
| "reward": 1.9562501072883607, |
| "reward_std": 0.42431456968188286, |
| "rewards/accuracy_reward": 0.1857142916880548, |
| "rewards/format_reward": 0.8633928954601288, |
| "rewards/tag_count_reward": 0.907142898440361, |
| "step": 1105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 735.2321746826171, |
| "epoch": 0.37893658786378764, |
| "grad_norm": 2.2775678634643555, |
| "kl": 2.25322265625, |
| "learning_rate": 2.343361623550087e-06, |
| "loss": 0.2052, |
| "reward": 1.963616180419922, |
| "reward_std": 0.4199460901319981, |
| "rewards/accuracy_reward": 0.18303572265431284, |
| "rewards/format_reward": 0.8732143223285675, |
| "rewards/tag_count_reward": 0.9073661148548127, |
| "step": 1110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 686.8169891357422, |
| "epoch": 0.38064350943074166, |
| "grad_norm": 0.7645272016525269, |
| "kl": 1.0261962890625, |
| "learning_rate": 2.3359547735120533e-06, |
| "loss": 0.0896, |
| "reward": 2.073214370012283, |
| "reward_std": 0.35470662005245684, |
| "rewards/accuracy_reward": 0.23214286658912897, |
| "rewards/format_reward": 0.9089286148548126, |
| "rewards/tag_count_reward": 0.9321429014205933, |
| "step": 1115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.1143157958984, |
| "epoch": 0.38235043099769567, |
| "grad_norm": 2.4553771018981934, |
| "kl": 1.4921875, |
| "learning_rate": 2.328518238926108e-06, |
| "loss": 0.1956, |
| "reward": 2.0895090222358705, |
| "reward_std": 0.3072059566155076, |
| "rewards/accuracy_reward": 0.20089286640286447, |
| "rewards/format_reward": 0.9339286088943481, |
| "rewards/tag_count_reward": 0.9546875387430191, |
| "step": 1120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 710.1732513427735, |
| "epoch": 0.3840573525646497, |
| "grad_norm": 14.056977272033691, |
| "kl": 1.6969482421875, |
| "learning_rate": 2.32105228386176e-06, |
| "loss": 0.1577, |
| "reward": 2.0502232909202576, |
| "reward_std": 0.3774921327829361, |
| "rewards/accuracy_reward": 0.17589286342263222, |
| "rewards/format_reward": 0.9223214656114578, |
| "rewards/tag_count_reward": 0.9520089715719223, |
| "step": 1125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.2696655273437, |
| "epoch": 0.38576427413160363, |
| "grad_norm": 5.242733478546143, |
| "kl": 1.62451171875, |
| "learning_rate": 2.313557173433233e-06, |
| "loss": 0.1798, |
| "reward": 2.0428572416305544, |
| "reward_std": 0.3383133355528116, |
| "rewards/accuracy_reward": 0.16339286314323545, |
| "rewards/format_reward": 0.9241071790456772, |
| "rewards/tag_count_reward": 0.9553571850061416, |
| "step": 1130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.1410980224609, |
| "epoch": 0.38747119569855765, |
| "grad_norm": 2.7701191902160645, |
| "kl": 2.2046142578125, |
| "learning_rate": 2.306033173790051e-06, |
| "loss": 0.2391, |
| "reward": 2.039062571525574, |
| "reward_std": 0.4759730361402035, |
| "rewards/accuracy_reward": 0.22321429550647737, |
| "rewards/format_reward": 0.8892857551574707, |
| "rewards/tag_count_reward": 0.9265625417232514, |
| "step": 1135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 717.9062866210937, |
| "epoch": 0.38917811726551166, |
| "grad_norm": 4.026260852813721, |
| "kl": 3.090625, |
| "learning_rate": 2.298480552107586e-06, |
| "loss": 0.3405, |
| "reward": 1.8812500834465027, |
| "reward_std": 0.5901964485645295, |
| "rewards/accuracy_reward": 0.14642857741564513, |
| "rewards/format_reward": 0.8419643312692642, |
| "rewards/tag_count_reward": 0.8928571879863739, |
| "step": 1140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 728.6393188476562, |
| "epoch": 0.39088503883246567, |
| "grad_norm": 5.9124603271484375, |
| "kl": 3.3755859375, |
| "learning_rate": 2.2908995765775724e-06, |
| "loss": 0.3093, |
| "reward": 1.8071429431438446, |
| "reward_std": 0.5719165071845055, |
| "rewards/accuracy_reward": 0.15000000419095158, |
| "rewards/format_reward": 0.7964286088943482, |
| "rewards/tag_count_reward": 0.8607143253087998, |
| "step": 1145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.6089508056641, |
| "epoch": 0.3925919603994196, |
| "grad_norm": 1.1206648349761963, |
| "kl": 1.8416015625, |
| "learning_rate": 2.283290516398582e-06, |
| "loss": 0.2198, |
| "reward": 1.9783483147621155, |
| "reward_std": 0.5003036454319953, |
| "rewards/accuracy_reward": 0.20178572246804832, |
| "rewards/format_reward": 0.8660714715719223, |
| "rewards/tag_count_reward": 0.9104911118745804, |
| "step": 1150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 688.6893173217774, |
| "epoch": 0.39429888196637364, |
| "grad_norm": 2.443659543991089, |
| "kl": 1.76376953125, |
| "learning_rate": 2.275653641766466e-06, |
| "loss": 0.1235, |
| "reward": 2.005803668498993, |
| "reward_std": 0.3753303915262222, |
| "rewards/accuracy_reward": 0.15714286388829352, |
| "rewards/format_reward": 0.9116071820259094, |
| "rewards/tag_count_reward": 0.9370536148548126, |
| "step": 1155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.0339630126953, |
| "epoch": 0.39600580353332765, |
| "grad_norm": 1.9696784019470215, |
| "kl": 0.81748046875, |
| "learning_rate": 2.2679892238647593e-06, |
| "loss": 0.127, |
| "reward": 2.1151786506175996, |
| "reward_std": 0.26731859482824805, |
| "rewards/accuracy_reward": 0.19375001126900315, |
| "rewards/format_reward": 0.9535714536905289, |
| "rewards/tag_count_reward": 0.967857176065445, |
| "step": 1160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 770.9250274658203, |
| "epoch": 0.39771272510028166, |
| "grad_norm": 6.627246856689453, |
| "kl": 1.4739501953125, |
| "learning_rate": 2.2602975348550526e-06, |
| "loss": 0.169, |
| "reward": 2.117857205867767, |
| "reward_std": 0.2829253111034632, |
| "rewards/accuracy_reward": 0.2026785809546709, |
| "rewards/format_reward": 0.9508928924798965, |
| "rewards/tag_count_reward": 0.9642857521772384, |
| "step": 1165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.2500427246093, |
| "epoch": 0.3994196466672356, |
| "grad_norm": 1.477338433265686, |
| "kl": 1.10830078125, |
| "learning_rate": 2.2525788478673256e-06, |
| "loss": 0.1475, |
| "reward": 2.150223308801651, |
| "reward_std": 0.3132738400250673, |
| "rewards/accuracy_reward": 0.24642858207225798, |
| "rewards/format_reward": 0.9437500387430191, |
| "rewards/tag_count_reward": 0.9600446879863739, |
| "step": 1170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 783.8303863525391, |
| "epoch": 0.40112656823418963, |
| "grad_norm": 3.81966495513916, |
| "kl": 1.746142578125, |
| "learning_rate": 2.2448334369902512e-06, |
| "loss": 0.167, |
| "reward": 2.0694197356700896, |
| "reward_std": 0.39150173366069796, |
| "rewards/accuracy_reward": 0.21160715222358703, |
| "rewards/format_reward": 0.9142857521772385, |
| "rewards/tag_count_reward": 0.943526816368103, |
| "step": 1175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 875.8545043945312, |
| "epoch": 0.40283348980114364, |
| "grad_norm": 4.390524387359619, |
| "kl": 1.417822265625, |
| "learning_rate": 2.2370615772614596e-06, |
| "loss": 0.1507, |
| "reward": 2.0004465222358703, |
| "reward_std": 0.38841529097408056, |
| "rewards/accuracy_reward": 0.13928572116419674, |
| "rewards/format_reward": 0.9187500417232514, |
| "rewards/tag_count_reward": 0.9424107611179352, |
| "step": 1180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 854.280404663086, |
| "epoch": 0.40454041136809765, |
| "grad_norm": 3.236765146255493, |
| "kl": 1.91328125, |
| "learning_rate": 2.229263544657774e-06, |
| "loss": 0.1798, |
| "reward": 2.026785832643509, |
| "reward_std": 0.4409887820482254, |
| "rewards/accuracy_reward": 0.20535715129226445, |
| "rewards/format_reward": 0.8919643223285675, |
| "rewards/tag_count_reward": 0.929464328289032, |
| "step": 1185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 848.9277191162109, |
| "epoch": 0.4062473329350516, |
| "grad_norm": 2.797024965286255, |
| "kl": 2.8095703125, |
| "learning_rate": 2.2214396160854086e-06, |
| "loss": 0.3007, |
| "reward": 1.9095982909202576, |
| "reward_std": 0.5447552859783172, |
| "rewards/accuracy_reward": 0.16696429532021284, |
| "rewards/format_reward": 0.848214328289032, |
| "rewards/tag_count_reward": 0.8944196909666061, |
| "step": 1190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 863.5241485595703, |
| "epoch": 0.4079542545020056, |
| "grad_norm": 1.5841748714447021, |
| "kl": 2.126171875, |
| "learning_rate": 2.2135900693701396e-06, |
| "loss": 0.1976, |
| "reward": 1.966517949104309, |
| "reward_std": 0.49482071250677107, |
| "rewards/accuracy_reward": 0.19375001098960637, |
| "rewards/format_reward": 0.8687500417232513, |
| "rewards/tag_count_reward": 0.9040179014205932, |
| "step": 1195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 841.4991424560546, |
| "epoch": 0.40966117606895963, |
| "grad_norm": 2.687502861022949, |
| "kl": 2.2541015625, |
| "learning_rate": 2.2057151832474344e-06, |
| "loss": 0.1548, |
| "reward": 1.9569197297096252, |
| "reward_std": 0.49666360318660735, |
| "rewards/accuracy_reward": 0.17500001080334188, |
| "rewards/format_reward": 0.8705357581377029, |
| "rewards/tag_count_reward": 0.9113839715719223, |
| "step": 1200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 822.8750457763672, |
| "epoch": 0.41136809763591364, |
| "grad_norm": 4.721628189086914, |
| "kl": 1.90126953125, |
| "learning_rate": 2.197815237352559e-06, |
| "loss": 0.2051, |
| "reward": 1.9877233028411865, |
| "reward_std": 0.4236783929169178, |
| "rewards/accuracy_reward": 0.16250000623986124, |
| "rewards/format_reward": 0.9000000387430191, |
| "rewards/tag_count_reward": 0.9252232581377029, |
| "step": 1205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 890.1616455078125, |
| "epoch": 0.41307501920286765, |
| "grad_norm": 5.242164611816406, |
| "kl": 1.93837890625, |
| "learning_rate": 2.189890512210643e-06, |
| "loss": 0.171, |
| "reward": 2.0156251013278963, |
| "reward_std": 0.44212441742420194, |
| "rewards/accuracy_reward": 0.20357143450528384, |
| "rewards/format_reward": 0.8901786148548126, |
| "rewards/tag_count_reward": 0.9218750387430191, |
| "step": 1210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 859.317007446289, |
| "epoch": 0.4147819407698216, |
| "grad_norm": 5.278153419494629, |
| "kl": 2.12939453125, |
| "learning_rate": 2.181941289226724e-06, |
| "loss": 0.2144, |
| "reward": 2.0169643819332124, |
| "reward_std": 0.41962831988930704, |
| "rewards/accuracy_reward": 0.18660715240985154, |
| "rewards/format_reward": 0.900892898440361, |
| "rewards/tag_count_reward": 0.9294643372297287, |
| "step": 1215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 826.7553985595703, |
| "epoch": 0.4164888623367756, |
| "grad_norm": 3.9751880168914795, |
| "kl": 2.369921875, |
| "learning_rate": 2.173967850675749e-06, |
| "loss": 0.2736, |
| "reward": 1.9564732968807221, |
| "reward_std": 0.44398799240589143, |
| "rewards/accuracy_reward": 0.1392857214435935, |
| "rewards/format_reward": 0.8946429014205932, |
| "rewards/tag_count_reward": 0.9225446850061416, |
| "step": 1220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 787.3187896728516, |
| "epoch": 0.41819578390372963, |
| "grad_norm": 2.2601590156555176, |
| "kl": 1.5919921875, |
| "learning_rate": 2.1659704796925556e-06, |
| "loss": 0.1694, |
| "reward": 1.9640626013278961, |
| "reward_std": 0.4538421332836151, |
| "rewards/accuracy_reward": 0.1473214347846806, |
| "rewards/format_reward": 0.8928571909666061, |
| "rewards/tag_count_reward": 0.9238839715719223, |
| "step": 1225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 787.5107513427735, |
| "epoch": 0.41990270547068365, |
| "grad_norm": 3.633096933364868, |
| "kl": 2.5519287109375, |
| "learning_rate": 2.157949460261816e-06, |
| "loss": 0.2969, |
| "reward": 1.9627233147621155, |
| "reward_std": 0.5212313048541546, |
| "rewards/accuracy_reward": 0.17678572060540318, |
| "rewards/format_reward": 0.8767857521772384, |
| "rewards/tag_count_reward": 0.9091518312692642, |
| "step": 1230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 823.5098602294922, |
| "epoch": 0.4216096270376376, |
| "grad_norm": 4.196262359619141, |
| "kl": 3.162890625, |
| "learning_rate": 2.149905077207953e-06, |
| "loss": 0.3764, |
| "reward": 1.890401875972748, |
| "reward_std": 0.5524288520216942, |
| "rewards/accuracy_reward": 0.18928572311997413, |
| "rewards/format_reward": 0.8285714656114578, |
| "rewards/tag_count_reward": 0.8725446820259094, |
| "step": 1235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 800.7946807861329, |
| "epoch": 0.4233165486045916, |
| "grad_norm": 14.379007339477539, |
| "kl": 2.49970703125, |
| "learning_rate": 2.1418376161850247e-06, |
| "loss": 0.2902, |
| "reward": 1.9551340341567993, |
| "reward_std": 0.5261093828827142, |
| "rewards/accuracy_reward": 0.1857142945751548, |
| "rewards/format_reward": 0.8633929044008255, |
| "rewards/tag_count_reward": 0.9060268223285675, |
| "step": 1240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 780.7705718994141, |
| "epoch": 0.4250234701715456, |
| "grad_norm": 1.6710326671600342, |
| "kl": 1.42890625, |
| "learning_rate": 2.133747363666584e-06, |
| "loss": 0.1738, |
| "reward": 2.026116156578064, |
| "reward_std": 0.39677606597542764, |
| "rewards/accuracy_reward": 0.17500000940635801, |
| "rewards/format_reward": 0.9151786118745804, |
| "rewards/tag_count_reward": 0.9359375447034836, |
| "step": 1245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.9598541259766, |
| "epoch": 0.42673039173849964, |
| "grad_norm": 2.11255145072937, |
| "kl": 2.662890625, |
| "learning_rate": 2.1256346069355026e-06, |
| "loss": 0.3397, |
| "reward": 2.0069197475910188, |
| "reward_std": 0.45871393829584123, |
| "rewards/accuracy_reward": 0.20267857946455478, |
| "rewards/format_reward": 0.8848214745521545, |
| "rewards/tag_count_reward": 0.9194196820259094, |
| "step": 1250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 817.2223602294922, |
| "epoch": 0.4284373133054536, |
| "grad_norm": 1.47514009475708, |
| "kl": 1.4896484375, |
| "learning_rate": 2.117499634073772e-06, |
| "loss": 0.2132, |
| "reward": 2.0267858028411867, |
| "reward_std": 0.42209520787000654, |
| "rewards/accuracy_reward": 0.17946429522708057, |
| "rewards/format_reward": 0.910714328289032, |
| "rewards/tag_count_reward": 0.9366071909666062, |
| "step": 1255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 768.3696807861328, |
| "epoch": 0.4301442348724076, |
| "grad_norm": 4.074100971221924, |
| "kl": 2.11015625, |
| "learning_rate": 2.1093427339522736e-06, |
| "loss": 0.2266, |
| "reward": 1.9930804431438447, |
| "reward_std": 0.403466971218586, |
| "rewards/accuracy_reward": 0.1750000081025064, |
| "rewards/format_reward": 0.894642898440361, |
| "rewards/tag_count_reward": 0.9234375357627869, |
| "step": 1260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 787.3937866210938, |
| "epoch": 0.4318511564393616, |
| "grad_norm": 3.29717755317688, |
| "kl": 1.61904296875, |
| "learning_rate": 2.1011641962205187e-06, |
| "loss": 0.228, |
| "reward": 1.9837054312229156, |
| "reward_std": 0.3893513225018978, |
| "rewards/accuracy_reward": 0.13660714868456125, |
| "rewards/format_reward": 0.9125000417232514, |
| "rewards/tag_count_reward": 0.9345982551574707, |
| "step": 1265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 810.4214721679688, |
| "epoch": 0.43355807800631563, |
| "grad_norm": 1.2225881814956665, |
| "kl": 2.337841796875, |
| "learning_rate": 2.092964311296366e-06, |
| "loss": 0.366, |
| "reward": 2.038616180419922, |
| "reward_std": 0.45512128323316575, |
| "rewards/accuracy_reward": 0.240178579185158, |
| "rewards/format_reward": 0.8839286088943481, |
| "rewards/tag_count_reward": 0.9145089685916901, |
| "step": 1270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.8562805175782, |
| "epoch": 0.4352649995732696, |
| "grad_norm": 0.9028781652450562, |
| "kl": 1.768310546875, |
| "learning_rate": 2.0847433703557086e-06, |
| "loss": 0.2351, |
| "reward": 1.9801340281963349, |
| "reward_std": 0.42181163243949416, |
| "rewards/accuracy_reward": 0.14732143403962256, |
| "rewards/format_reward": 0.9035714715719223, |
| "rewards/tag_count_reward": 0.9292411148548126, |
| "step": 1275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 774.1705718994141, |
| "epoch": 0.4369719211402236, |
| "grad_norm": 3.9188528060913086, |
| "kl": 1.5515625, |
| "learning_rate": 2.0765016653221312e-06, |
| "loss": 0.1852, |
| "reward": 2.011160808801651, |
| "reward_std": 0.4215161487460136, |
| "rewards/accuracy_reward": 0.17053572349250318, |
| "rewards/format_reward": 0.9098214656114578, |
| "rewards/tag_count_reward": 0.9308036059141159, |
| "step": 1280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 752.0446746826171, |
| "epoch": 0.4386788427071776, |
| "grad_norm": 1.324172019958496, |
| "kl": 1.8251708984375, |
| "learning_rate": 2.068239488856549e-06, |
| "loss": 0.1298, |
| "reward": 1.9770090222358703, |
| "reward_std": 0.3746281571686268, |
| "rewards/accuracy_reward": 0.13303572088479995, |
| "rewards/format_reward": 0.9116071850061417, |
| "rewards/tag_count_reward": 0.9323661148548126, |
| "step": 1285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 783.0786010742188, |
| "epoch": 0.4403857642741316, |
| "grad_norm": 1.5869454145431519, |
| "kl": 1.52734375, |
| "learning_rate": 2.05995713434681e-06, |
| "loss": 0.1537, |
| "reward": 1.961384004354477, |
| "reward_std": 0.4327257826924324, |
| "rewards/accuracy_reward": 0.15446429289877414, |
| "rewards/format_reward": 0.8857143253087998, |
| "rewards/tag_count_reward": 0.921205398440361, |
| "step": 1290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 810.894677734375, |
| "epoch": 0.4420926858410856, |
| "grad_norm": 3.493547201156616, |
| "kl": 2.1126953125, |
| "learning_rate": 2.0516548958972816e-06, |
| "loss": 0.2248, |
| "reward": 1.917410784959793, |
| "reward_std": 0.5152807034552097, |
| "rewards/accuracy_reward": 0.16339286556467414, |
| "rewards/format_reward": 0.8562500476837158, |
| "rewards/tag_count_reward": 0.8977679044008255, |
| "step": 1295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.244677734375, |
| "epoch": 0.4437996074080396, |
| "grad_norm": 1.382163405418396, |
| "kl": 1.617578125, |
| "learning_rate": 2.043333068318405e-06, |
| "loss": 0.1537, |
| "reward": 1.9573661506175994, |
| "reward_std": 0.5043010532855987, |
| "rewards/accuracy_reward": 0.1919642925262451, |
| "rewards/format_reward": 0.8625000417232513, |
| "rewards/tag_count_reward": 0.902901828289032, |
| "step": 1300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 761.0607482910157, |
| "epoch": 0.4455065289749936, |
| "grad_norm": 2.492479085922241, |
| "kl": 2.128125, |
| "learning_rate": 2.0349919471162245e-06, |
| "loss": 0.1946, |
| "reward": 1.8754465103149414, |
| "reward_std": 0.5213326171040535, |
| "rewards/accuracy_reward": 0.1455357194878161, |
| "rewards/format_reward": 0.8401786118745804, |
| "rewards/tag_count_reward": 0.8897321820259094, |
| "step": 1305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 812.150927734375, |
| "epoch": 0.4472134505419476, |
| "grad_norm": 4.123756408691406, |
| "kl": 2.7369140625, |
| "learning_rate": 2.0266318284818983e-06, |
| "loss": 0.2577, |
| "reward": 1.83883935213089, |
| "reward_std": 0.5365928649902344, |
| "rewards/accuracy_reward": 0.14642857788130642, |
| "rewards/format_reward": 0.8160714715719223, |
| "rewards/tag_count_reward": 0.8763393282890319, |
| "step": 1310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 758.2562866210938, |
| "epoch": 0.4489203721089016, |
| "grad_norm": 3.1437504291534424, |
| "kl": 1.9005859375, |
| "learning_rate": 2.0182530092811776e-06, |
| "loss": 0.2219, |
| "reward": 1.877678668498993, |
| "reward_std": 0.5208032101392746, |
| "rewards/accuracy_reward": 0.1464285783469677, |
| "rewards/format_reward": 0.84107146859169, |
| "rewards/tag_count_reward": 0.8901786118745804, |
| "step": 1315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 773.2527160644531, |
| "epoch": 0.4506272936758556, |
| "grad_norm": 3.798027753829956, |
| "kl": 2.2734375, |
| "learning_rate": 2.0098557870438672e-06, |
| "loss": 0.2386, |
| "reward": 1.937946516275406, |
| "reward_std": 0.5240325286984444, |
| "rewards/accuracy_reward": 0.18482143841683865, |
| "rewards/format_reward": 0.8526786148548127, |
| "rewards/tag_count_reward": 0.9004464715719223, |
| "step": 1320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 755.4589569091797, |
| "epoch": 0.4523342152428096, |
| "grad_norm": 4.841824054718018, |
| "kl": 2.136328125, |
| "learning_rate": 2.001440459953258e-06, |
| "loss": 0.1876, |
| "reward": 2.003125101327896, |
| "reward_std": 0.4476234719157219, |
| "rewards/accuracy_reward": 0.20178572181612253, |
| "rewards/format_reward": 0.883035758137703, |
| "rewards/tag_count_reward": 0.9183036148548126, |
| "step": 1325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.7973480224609, |
| "epoch": 0.4540411368097636, |
| "grad_norm": 19.314414978027344, |
| "kl": 1.78818359375, |
| "learning_rate": 1.99300732683554e-06, |
| "loss": 0.1903, |
| "reward": 2.0245536625385285, |
| "reward_std": 0.39623707011342046, |
| "rewards/accuracy_reward": 0.1741071510128677, |
| "rewards/format_reward": 0.9125000447034836, |
| "rewards/tag_count_reward": 0.9379464775323868, |
| "step": 1330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 795.800927734375, |
| "epoch": 0.4557480583767176, |
| "grad_norm": 1.6212748289108276, |
| "kl": 2.395703125, |
| "learning_rate": 1.9845566871491923e-06, |
| "loss": 0.2654, |
| "reward": 1.9578125953674317, |
| "reward_std": 0.4312289670109749, |
| "rewards/accuracy_reward": 0.14017857611179352, |
| "rewards/format_reward": 0.8919643282890319, |
| "rewards/tag_count_reward": 0.9256696820259094, |
| "step": 1335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 736.050927734375, |
| "epoch": 0.45745497994367157, |
| "grad_norm": 3.6908681392669678, |
| "kl": 2.1092041015625, |
| "learning_rate": 1.9760888409743456e-06, |
| "loss": 0.2535, |
| "reward": 2.0162947356700895, |
| "reward_std": 0.40158444084227085, |
| "rewards/accuracy_reward": 0.1982142932713032, |
| "rewards/format_reward": 0.894642898440361, |
| "rewards/tag_count_reward": 0.9234375447034836, |
| "step": 1340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 764.6018188476562, |
| "epoch": 0.4591619015106256, |
| "grad_norm": 5.394177436828613, |
| "kl": 1.6607421875, |
| "learning_rate": 1.96760408900213e-06, |
| "loss": 0.2132, |
| "reward": 2.040178656578064, |
| "reward_std": 0.4042182721197605, |
| "rewards/accuracy_reward": 0.19285715064033865, |
| "rewards/format_reward": 0.9107143253087997, |
| "rewards/tag_count_reward": 0.9366071790456771, |
| "step": 1345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 786.1000396728516, |
| "epoch": 0.4608688230775796, |
| "grad_norm": 4.516533374786377, |
| "kl": 2.57392578125, |
| "learning_rate": 1.9591027325239968e-06, |
| "loss": 0.2251, |
| "reward": 1.9424108147621155, |
| "reward_std": 0.5002983555197715, |
| "rewards/accuracy_reward": 0.16517857778817416, |
| "rewards/format_reward": 0.86607146859169, |
| "rewards/tag_count_reward": 0.9111607521772385, |
| "step": 1350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 763.5866394042969, |
| "epoch": 0.4625757446445336, |
| "grad_norm": 2.373253107070923, |
| "kl": 2.108984375, |
| "learning_rate": 1.950585073421018e-06, |
| "loss": 0.2665, |
| "reward": 1.9013393759727477, |
| "reward_std": 0.4912081308662891, |
| "rewards/accuracy_reward": 0.11517857778817416, |
| "rewards/format_reward": 0.875892898440361, |
| "rewards/tag_count_reward": 0.910267898440361, |
| "step": 1355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 803.5339721679687, |
| "epoch": 0.46428266621148756, |
| "grad_norm": 1.557633399963379, |
| "kl": 1.86875, |
| "learning_rate": 1.942051414153169e-06, |
| "loss": 0.2379, |
| "reward": 1.9354911744594574, |
| "reward_std": 0.47398936599493025, |
| "rewards/accuracy_reward": 0.13928572162985803, |
| "rewards/format_reward": 0.8794643312692643, |
| "rewards/tag_count_reward": 0.9167411029338837, |
| "step": 1360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 810.9598541259766, |
| "epoch": 0.46598958777844157, |
| "grad_norm": 1.5708023309707642, |
| "kl": 2.6498046875, |
| "learning_rate": 1.933502057748587e-06, |
| "loss": 0.2696, |
| "reward": 1.9158483147621155, |
| "reward_std": 0.5079516015946866, |
| "rewards/accuracy_reward": 0.1535714370198548, |
| "rewards/format_reward": 0.8607143282890319, |
| "rewards/tag_count_reward": 0.9015625387430191, |
| "step": 1365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 849.8884338378906, |
| "epoch": 0.4676965093453956, |
| "grad_norm": 10.703516006469727, |
| "kl": 2.755859375, |
| "learning_rate": 1.9249373077928083e-06, |
| "loss": 0.3247, |
| "reward": 1.8904018700122833, |
| "reward_std": 0.5384089753031731, |
| "rewards/accuracy_reward": 0.1571428648196161, |
| "rewards/format_reward": 0.8473214685916901, |
| "rewards/tag_count_reward": 0.8859375476837158, |
| "step": 1370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 816.4321716308593, |
| "epoch": 0.4694034309123496, |
| "grad_norm": 4.501167297363281, |
| "kl": 2.9134765625, |
| "learning_rate": 1.916357468417994e-06, |
| "loss": 0.3764, |
| "reward": 1.8883929431438446, |
| "reward_std": 0.554328379034996, |
| "rewards/accuracy_reward": 0.16339286603033543, |
| "rewards/format_reward": 0.8419643253087997, |
| "rewards/tag_count_reward": 0.883035758137703, |
| "step": 1375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 816.8000396728515, |
| "epoch": 0.47111035247930355, |
| "grad_norm": 4.633331298828125, |
| "kl": 2.7892578125, |
| "learning_rate": 1.9077628442921244e-06, |
| "loss": 0.3534, |
| "reward": 1.9020090043544768, |
| "reward_std": 0.5272788584232331, |
| "rewards/accuracy_reward": 0.15625000484287738, |
| "rewards/format_reward": 0.8517857521772385, |
| "rewards/tag_count_reward": 0.8939732611179352, |
| "step": 1380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 771.9250305175781, |
| "epoch": 0.47281727404625756, |
| "grad_norm": 10.220499992370605, |
| "kl": 3.145703125, |
| "learning_rate": 1.8991537406081833e-06, |
| "loss": 0.3939, |
| "reward": 1.8629464983940125, |
| "reward_std": 0.49162818491458893, |
| "rewards/accuracy_reward": 0.14107143618166446, |
| "rewards/format_reward": 0.8366071820259094, |
| "rewards/tag_count_reward": 0.8852679044008255, |
| "step": 1385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 794.0920043945313, |
| "epoch": 0.4745241956132116, |
| "grad_norm": 5.842833518981934, |
| "kl": 2.51865234375, |
| "learning_rate": 1.8905304630733202e-06, |
| "loss": 0.3048, |
| "reward": 1.9725447416305542, |
| "reward_std": 0.5114175193011761, |
| "rewards/accuracy_reward": 0.23035715110599994, |
| "rewards/format_reward": 0.854464328289032, |
| "rewards/tag_count_reward": 0.8877232521772385, |
| "step": 1390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.0598571777343, |
| "epoch": 0.4762311171801656, |
| "grad_norm": 6.666676998138428, |
| "kl": 2.329638671875, |
| "learning_rate": 1.881893317897994e-06, |
| "loss": 0.28, |
| "reward": 1.9508929431438446, |
| "reward_std": 0.4666281685233116, |
| "rewards/accuracy_reward": 0.17053572190925478, |
| "rewards/format_reward": 0.873214328289032, |
| "rewards/tag_count_reward": 0.907142898440361, |
| "step": 1395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 845.3866455078125, |
| "epoch": 0.47793803874711954, |
| "grad_norm": 4.843286991119385, |
| "kl": 2.96796875, |
| "learning_rate": 1.8732426117851007e-06, |
| "loss": 0.3336, |
| "reward": 1.872991156578064, |
| "reward_std": 0.531745757162571, |
| "rewards/accuracy_reward": 0.15714286407455802, |
| "rewards/format_reward": 0.835714328289032, |
| "rewards/tag_count_reward": 0.8801339656114578, |
| "step": 1400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 811.6437866210938, |
| "epoch": 0.47964496031407355, |
| "grad_norm": 5.695433139801025, |
| "kl": 2.312890625, |
| "learning_rate": 1.8645786519190823e-06, |
| "loss": 0.2987, |
| "reward": 1.8801340222358705, |
| "reward_std": 0.5154039770364761, |
| "rewards/accuracy_reward": 0.13571429047733546, |
| "rewards/format_reward": 0.845535758137703, |
| "rewards/tag_count_reward": 0.8988839715719223, |
| "step": 1405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 846.7286102294922, |
| "epoch": 0.48135188188102757, |
| "grad_norm": 6.439587593078613, |
| "kl": 2.5794921875, |
| "learning_rate": 1.8559017459550167e-06, |
| "loss": 0.3875, |
| "reward": 1.8656250655651092, |
| "reward_std": 0.5760970249772072, |
| "rewards/accuracy_reward": 0.17053572209551932, |
| "rewards/format_reward": 0.8205357521772385, |
| "rewards/tag_count_reward": 0.8745536088943482, |
| "step": 1410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 954.6134429931641, |
| "epoch": 0.4830588034479816, |
| "grad_norm": 6.831388473510742, |
| "kl": 3.415234375, |
| "learning_rate": 1.8472122020076958e-06, |
| "loss": 0.4372, |
| "reward": 1.7779018580913544, |
| "reward_std": 0.6864479184150696, |
| "rewards/accuracy_reward": 0.15178572349250316, |
| "rewards/format_reward": 0.7741071820259094, |
| "rewards/tag_count_reward": 0.8520089715719223, |
| "step": 1415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 928.3268310546875, |
| "epoch": 0.4847657250149356, |
| "grad_norm": 4.245449542999268, |
| "kl": 3.4875, |
| "learning_rate": 1.8385103286406828e-06, |
| "loss": 0.5199, |
| "reward": 1.7602679431438446, |
| "reward_std": 0.658353678882122, |
| "rewards/accuracy_reward": 0.17142857844009995, |
| "rewards/format_reward": 0.7517857402563095, |
| "rewards/tag_count_reward": 0.8370536118745804, |
| "step": 1420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 945.8277252197265, |
| "epoch": 0.48647264658188955, |
| "grad_norm": 4.0879902839660645, |
| "kl": 3.248046875, |
| "learning_rate": 1.8297964348553555e-06, |
| "loss": 0.4751, |
| "reward": 1.7975447297096252, |
| "reward_std": 0.651876625418663, |
| "rewards/accuracy_reward": 0.16517857937142252, |
| "rewards/format_reward": 0.7839286118745804, |
| "rewards/tag_count_reward": 0.8484375387430191, |
| "step": 1425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 878.1509368896484, |
| "epoch": 0.48817956814884356, |
| "grad_norm": 4.380373001098633, |
| "kl": 2.2369140625, |
| "learning_rate": 1.821070830079935e-06, |
| "loss": 0.3507, |
| "reward": 1.8169643521308898, |
| "reward_std": 0.5784257367253304, |
| "rewards/accuracy_reward": 0.14553571958094835, |
| "rewards/format_reward": 0.810714328289032, |
| "rewards/tag_count_reward": 0.8607143253087998, |
| "step": 1430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 820.9803894042968, |
| "epoch": 0.48988648971579757, |
| "grad_norm": 2.2317075729370117, |
| "kl": 1.8021484375, |
| "learning_rate": 1.812333824158494e-06, |
| "loss": 0.2936, |
| "reward": 1.8819197297096253, |
| "reward_std": 0.5624213635921478, |
| "rewards/accuracy_reward": 0.1625000074505806, |
| "rewards/format_reward": 0.8375000447034836, |
| "rewards/tag_count_reward": 0.8819196820259094, |
| "step": 1435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 823.3062927246094, |
| "epoch": 0.4915934112827516, |
| "grad_norm": 2.928972005844116, |
| "kl": 2.2279296875, |
| "learning_rate": 1.80358572733996e-06, |
| "loss": 0.2098, |
| "reward": 1.8060268759727478, |
| "reward_std": 0.5900841251015663, |
| "rewards/accuracy_reward": 0.1669642923399806, |
| "rewards/format_reward": 0.7946428924798965, |
| "rewards/tag_count_reward": 0.8444196790456772, |
| "step": 1440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 766.2384307861328, |
| "epoch": 0.49330033284970554, |
| "grad_norm": 2.458934783935547, |
| "kl": 1.76279296875, |
| "learning_rate": 1.7948268502670936e-06, |
| "loss": 0.2435, |
| "reward": 1.9732143819332122, |
| "reward_std": 0.4333352468907833, |
| "rewards/accuracy_reward": 0.17767857909202575, |
| "rewards/format_reward": 0.8839286148548127, |
| "rewards/tag_count_reward": 0.9116071820259094, |
| "step": 1445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 753.6009185791015, |
| "epoch": 0.49500725441665955, |
| "grad_norm": 2.1454453468322754, |
| "kl": 1.71591796875, |
| "learning_rate": 1.7860575039654605e-06, |
| "loss": 0.2184, |
| "reward": 1.9578125774860382, |
| "reward_std": 0.4223016068339348, |
| "rewards/accuracy_reward": 0.16517857760190963, |
| "rewards/format_reward": 0.8830357551574707, |
| "rewards/tag_count_reward": 0.9095982551574707, |
| "step": 1450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 834.9000396728516, |
| "epoch": 0.49671417598361356, |
| "grad_norm": 4.523981094360352, |
| "kl": 2.29609375, |
| "learning_rate": 1.7772779998323859e-06, |
| "loss": 0.3067, |
| "reward": 1.8763393580913543, |
| "reward_std": 0.5456696435809135, |
| "rewards/accuracy_reward": 0.172321436367929, |
| "rewards/format_reward": 0.8348214656114579, |
| "rewards/tag_count_reward": 0.8691964626312256, |
| "step": 1455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 839.538427734375, |
| "epoch": 0.49842109755056757, |
| "grad_norm": 3.2237324714660645, |
| "kl": 2.2123046875, |
| "learning_rate": 1.768488649625897e-06, |
| "loss": 0.3073, |
| "reward": 1.825446492433548, |
| "reward_std": 0.5417851440608501, |
| "rewards/accuracy_reward": 0.11696429047733545, |
| "rewards/format_reward": 0.8312500387430191, |
| "rewards/tag_count_reward": 0.8772321790456772, |
| "step": 1460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 828.7875366210938, |
| "epoch": 0.5001280191175216, |
| "grad_norm": 6.970083713531494, |
| "kl": 2.060546875, |
| "learning_rate": 1.7596897654536527e-06, |
| "loss": 0.2593, |
| "reward": 1.8203125894069672, |
| "reward_std": 0.5907119750976563, |
| "rewards/accuracy_reward": 0.14107143683359027, |
| "rewards/format_reward": 0.8169643253087997, |
| "rewards/tag_count_reward": 0.862276816368103, |
| "step": 1465 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 805.7134307861328, |
| "epoch": 0.5018349406844755, |
| "grad_norm": 15.752748489379883, |
| "kl": 2.48359375, |
| "learning_rate": 1.7508816597618611e-06, |
| "loss": 0.3889, |
| "reward": 1.835714375972748, |
| "reward_std": 0.5934141919016838, |
| "rewards/accuracy_reward": 0.147321436740458, |
| "rewards/format_reward": 0.8223214685916901, |
| "rewards/tag_count_reward": 0.8660714715719223, |
| "step": 1470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 995.4866485595703, |
| "epoch": 0.5035418622514295, |
| "grad_norm": 10.973194122314453, |
| "kl": 4.366015625, |
| "learning_rate": 1.742064645324183e-06, |
| "loss": 0.5055, |
| "reward": 1.472098284959793, |
| "reward_std": 0.7849221974611282, |
| "rewards/accuracy_reward": 0.16071429327130318, |
| "rewards/format_reward": 0.5678571671247482, |
| "rewards/tag_count_reward": 0.743526816368103, |
| "step": 1475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1102.144708251953, |
| "epoch": 0.5052487838183836, |
| "grad_norm": 6.444234371185303, |
| "kl": 4.0703125, |
| "learning_rate": 1.7332390352306282e-06, |
| "loss": 0.3969, |
| "reward": 0.9915179014205933, |
| "reward_std": 0.6856313347816467, |
| "rewards/accuracy_reward": 0.1000000050291419, |
| "rewards/format_reward": 0.28392858281731603, |
| "rewards/tag_count_reward": 0.607589316368103, |
| "step": 1480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 862.6955749511719, |
| "epoch": 0.5069557053853375, |
| "grad_norm": 3.410790205001831, |
| "kl": 1.875390625, |
| "learning_rate": 1.7244051428764343e-06, |
| "loss": 0.2012, |
| "reward": 0.9638393312692642, |
| "reward_std": 0.6327465578913689, |
| "rewards/accuracy_reward": 0.14732143711298704, |
| "rewards/format_reward": 0.21071429494768382, |
| "rewards/tag_count_reward": 0.6058035999536514, |
| "step": 1485 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 769.9509246826171, |
| "epoch": 0.5086626269522916, |
| "grad_norm": 1.9697375297546387, |
| "kl": 1.87890625, |
| "learning_rate": 1.7155632819509417e-06, |
| "loss": 0.2169, |
| "reward": 0.9707589715719223, |
| "reward_std": 0.6286859422922134, |
| "rewards/accuracy_reward": 0.13571429383009673, |
| "rewards/format_reward": 0.22767858132719992, |
| "rewards/tag_count_reward": 0.607366093993187, |
| "step": 1490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 766.0982421875, |
| "epoch": 0.5103695485192455, |
| "grad_norm": 10.65464973449707, |
| "kl": 2.2140625, |
| "learning_rate": 1.7067137664264521e-06, |
| "loss": 0.2392, |
| "reward": 1.3678571969270705, |
| "reward_std": 0.768636429309845, |
| "rewards/accuracy_reward": 0.13571429224684833, |
| "rewards/format_reward": 0.4928571656346321, |
| "rewards/tag_count_reward": 0.7392857491970062, |
| "step": 1495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 736.3580718994141, |
| "epoch": 0.5120764700861995, |
| "grad_norm": 2.2699103355407715, |
| "kl": 1.77646484375, |
| "learning_rate": 1.6978569105470792e-06, |
| "loss": 0.1815, |
| "reward": 1.8080357968807221, |
| "reward_std": 0.6647521004080772, |
| "rewards/accuracy_reward": 0.19642858039587735, |
| "rewards/format_reward": 0.7562500357627868, |
| "rewards/tag_count_reward": 0.8553571820259094, |
| "step": 1500 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 779.2143188476563, |
| "epoch": 0.5137833916531536, |
| "grad_norm": 6.321624279022217, |
| "kl": 2.9748046875, |
| "learning_rate": 1.6889930288175922e-06, |
| "loss": 0.3307, |
| "reward": 1.8591518819332122, |
| "reward_std": 0.6267977371811867, |
| "rewards/accuracy_reward": 0.18214286677539349, |
| "rewards/format_reward": 0.8017857581377029, |
| "rewards/tag_count_reward": 0.8752232521772385, |
| "step": 1505 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 795.1321807861328, |
| "epoch": 0.5154903132201075, |
| "grad_norm": 1.8599531650543213, |
| "kl": 3.016796875, |
| "learning_rate": 1.6801224359922466e-06, |
| "loss": 0.3243, |
| "reward": 1.79151793718338, |
| "reward_std": 0.6220594555139541, |
| "rewards/accuracy_reward": 0.14910714784637094, |
| "rewards/format_reward": 0.7839286088943481, |
| "rewards/tag_count_reward": 0.8584821820259094, |
| "step": 1510 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 790.3187866210938, |
| "epoch": 0.5171972347870616, |
| "grad_norm": 4.756523132324219, |
| "kl": 1.9025390625, |
| "learning_rate": 1.6712454470636052e-06, |
| "loss": 0.2568, |
| "reward": 1.8700893700122834, |
| "reward_std": 0.558867233991623, |
| "rewards/accuracy_reward": 0.19375000949949026, |
| "rewards/format_reward": 0.8107143223285675, |
| "rewards/tag_count_reward": 0.8656250447034836, |
| "step": 1515 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 774.834848022461, |
| "epoch": 0.5189041563540155, |
| "grad_norm": 1.1406761407852173, |
| "kl": 1.52021484375, |
| "learning_rate": 1.6623623772513576e-06, |
| "loss": 0.2219, |
| "reward": 1.9486608028411865, |
| "reward_std": 0.46728189289569855, |
| "rewards/accuracy_reward": 0.17767857825383543, |
| "rewards/format_reward": 0.8642857551574707, |
| "rewards/tag_count_reward": 0.9066964685916901, |
| "step": 1520 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.0571685791016, |
| "epoch": 0.5206110779209695, |
| "grad_norm": 1.1958781480789185, |
| "kl": 1.4518798828125, |
| "learning_rate": 1.6534735419911228e-06, |
| "loss": 0.2041, |
| "reward": 1.9100447177886963, |
| "reward_std": 0.4585052601993084, |
| "rewards/accuracy_reward": 0.15178572274744512, |
| "rewards/format_reward": 0.8625000387430191, |
| "rewards/tag_count_reward": 0.89575896859169, |
| "step": 1525 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 802.6937805175781, |
| "epoch": 0.5223179994879236, |
| "grad_norm": 4.186196327209473, |
| "kl": 1.6386474609375, |
| "learning_rate": 1.6445792569232486e-06, |
| "loss": 0.1724, |
| "reward": 1.8738840222358704, |
| "reward_std": 0.5293861232697964, |
| "rewards/accuracy_reward": 0.15535714933648706, |
| "rewards/format_reward": 0.8401786059141159, |
| "rewards/tag_count_reward": 0.8783482551574707, |
| "step": 1530 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 802.0018249511719, |
| "epoch": 0.5240249210548775, |
| "grad_norm": 4.158358573913574, |
| "kl": 0.99736328125, |
| "learning_rate": 1.635679837881606e-06, |
| "loss": 0.1695, |
| "reward": 1.9618304431438447, |
| "reward_std": 0.4689994312822819, |
| "rewards/accuracy_reward": 0.17321429261937737, |
| "rewards/format_reward": 0.8794643312692643, |
| "rewards/tag_count_reward": 0.9091518253087998, |
| "step": 1535 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 778.8018188476562, |
| "epoch": 0.5257318426218315, |
| "grad_norm": 1.9054951667785645, |
| "kl": 0.9982421875, |
| "learning_rate": 1.6267756008823701e-06, |
| "loss": 0.1331, |
| "reward": 1.9642858028411865, |
| "reward_std": 0.4854655273258686, |
| "rewards/accuracy_reward": 0.21785715147852897, |
| "rewards/format_reward": 0.8535714656114578, |
| "rewards/tag_count_reward": 0.8928571850061416, |
| "step": 1540 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 790.87236328125, |
| "epoch": 0.5274387641887855, |
| "grad_norm": 3.2148427963256836, |
| "kl": 0.77548828125, |
| "learning_rate": 1.6178668621128018e-06, |
| "loss": 0.1566, |
| "reward": 1.8600447177886963, |
| "reward_std": 0.534805352985859, |
| "rewards/accuracy_reward": 0.1491071498952806, |
| "rewards/format_reward": 0.8348214715719223, |
| "rewards/tag_count_reward": 0.8761161118745804, |
| "step": 1545 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 772.5991455078125, |
| "epoch": 0.5291456857557395, |
| "grad_norm": 2.275534152984619, |
| "kl": 0.71689453125, |
| "learning_rate": 1.6089539379200189e-06, |
| "loss": 0.1425, |
| "reward": 1.9479911565780639, |
| "reward_std": 0.4742655538022518, |
| "rewards/accuracy_reward": 0.16696429336443544, |
| "rewards/format_reward": 0.8714286148548126, |
| "rewards/tag_count_reward": 0.9095982581377029, |
| "step": 1550 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 743.4250305175781, |
| "epoch": 0.5308526073226936, |
| "grad_norm": 1.629341959953308, |
| "kl": 0.627587890625, |
| "learning_rate": 1.6000371447997617e-06, |
| "loss": 0.1077, |
| "reward": 2.0165179669857025, |
| "reward_std": 0.4406208969652653, |
| "rewards/accuracy_reward": 0.20625001080334188, |
| "rewards/format_reward": 0.8910714685916901, |
| "rewards/tag_count_reward": 0.91919646859169, |
| "step": 1555 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 693.431283569336, |
| "epoch": 0.5325595288896475, |
| "grad_norm": 0.5048644542694092, |
| "kl": 0.6044921875, |
| "learning_rate": 1.591116799385156e-06, |
| "loss": 0.0448, |
| "reward": 2.0198661506175997, |
| "reward_std": 0.44208099469542506, |
| "rewards/accuracy_reward": 0.23660715185105802, |
| "rewards/format_reward": 0.8732143312692642, |
| "rewards/tag_count_reward": 0.910044687986374, |
| "step": 1560 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 712.6607482910156, |
| "epoch": 0.5342664504566015, |
| "grad_norm": 0.7713425755500793, |
| "kl": 0.63125, |
| "learning_rate": 1.5821932184354677e-06, |
| "loss": 0.0739, |
| "reward": 2.0334822297096253, |
| "reward_std": 0.4626117169857025, |
| "rewards/accuracy_reward": 0.2223214386962354, |
| "rewards/format_reward": 0.8910714745521545, |
| "rewards/tag_count_reward": 0.9200893342494965, |
| "step": 1565 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.1116363525391, |
| "epoch": 0.5359733720235555, |
| "grad_norm": 1.309543251991272, |
| "kl": 0.68330078125, |
| "learning_rate": 1.5732667188248568e-06, |
| "loss": 0.0613, |
| "reward": 1.997991144657135, |
| "reward_std": 0.4713860541582108, |
| "rewards/accuracy_reward": 0.1919642967171967, |
| "rewards/format_reward": 0.886607187986374, |
| "rewards/tag_count_reward": 0.9194196850061417, |
| "step": 1570 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 718.6598480224609, |
| "epoch": 0.5376802935905095, |
| "grad_norm": 0.7583140730857849, |
| "kl": 0.94140625, |
| "learning_rate": 1.5643376175311233e-06, |
| "loss": 0.0464, |
| "reward": 2.0303572535514833, |
| "reward_std": 0.46192915737628937, |
| "rewards/accuracy_reward": 0.23392858356237411, |
| "rewards/format_reward": 0.8848214775323868, |
| "rewards/tag_count_reward": 0.9116071879863739, |
| "step": 1575 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.775927734375, |
| "epoch": 0.5393872151574635, |
| "grad_norm": 2.4456801414489746, |
| "kl": 0.96123046875, |
| "learning_rate": 1.555406231624453e-06, |
| "loss": 0.0798, |
| "reward": 1.9665179789066314, |
| "reward_std": 0.4243281804025173, |
| "rewards/accuracy_reward": 0.16785715064033865, |
| "rewards/format_reward": 0.8848214656114578, |
| "rewards/tag_count_reward": 0.913839328289032, |
| "step": 1580 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.4625274658204, |
| "epoch": 0.5410941367244175, |
| "grad_norm": 2.722489595413208, |
| "kl": 1.0029296875, |
| "learning_rate": 1.5464728782561578e-06, |
| "loss": 0.0884, |
| "reward": 1.9870536386966706, |
| "reward_std": 0.3701733611524105, |
| "rewards/accuracy_reward": 0.17500000689178705, |
| "rewards/format_reward": 0.8928571879863739, |
| "rewards/tag_count_reward": 0.91919646859169, |
| "step": 1585 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 767.0625396728516, |
| "epoch": 0.5428010582913715, |
| "grad_norm": 9.577692031860352, |
| "kl": 1.68056640625, |
| "learning_rate": 1.537537874647413e-06, |
| "loss": 0.1848, |
| "reward": 2.045535796880722, |
| "reward_std": 0.3802845995873213, |
| "rewards/accuracy_reward": 0.21428572600707413, |
| "rewards/format_reward": 0.9026786178350449, |
| "rewards/tag_count_reward": 0.92857146859169, |
| "step": 1590 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 841.7741516113281, |
| "epoch": 0.5445079798583256, |
| "grad_norm": 5.46830415725708, |
| "kl": 2.172265625, |
| "learning_rate": 1.5286015380779939e-06, |
| "loss": 0.2761, |
| "reward": 2.0111608028411867, |
| "reward_std": 0.4875886231660843, |
| "rewards/accuracy_reward": 0.21785715352743865, |
| "rewards/format_reward": 0.8812500417232514, |
| "rewards/tag_count_reward": 0.9120536148548126, |
| "step": 1595 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 802.6259307861328, |
| "epoch": 0.5462149014252795, |
| "grad_norm": 56.66122817993164, |
| "kl": 2.2943359375, |
| "learning_rate": 1.5196641858750092e-06, |
| "loss": 0.3174, |
| "reward": 1.9904018878936767, |
| "reward_std": 0.43015862852334974, |
| "rewards/accuracy_reward": 0.18750000754371285, |
| "rewards/format_reward": 0.8866071850061417, |
| "rewards/tag_count_reward": 0.9162946820259095, |
| "step": 1600 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 812.5286102294922, |
| "epoch": 0.5479218229922335, |
| "grad_norm": 10.48519229888916, |
| "kl": 2.0384765625, |
| "learning_rate": 1.5107261354016317e-06, |
| "loss": 0.2719, |
| "reward": 2.0328125834465025, |
| "reward_std": 0.4505886062979698, |
| "rewards/accuracy_reward": 0.22857143878936767, |
| "rewards/format_reward": 0.8883928984403611, |
| "rewards/tag_count_reward": 0.9158482551574707, |
| "step": 1605 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 788.7830780029296, |
| "epoch": 0.5496287445591875, |
| "grad_norm": 1.7132465839385986, |
| "kl": 2.090625, |
| "learning_rate": 1.5017877040458307e-06, |
| "loss": 0.3233, |
| "reward": 1.9564733147621154, |
| "reward_std": 0.4755387619137764, |
| "rewards/accuracy_reward": 0.18482143748551608, |
| "rewards/format_reward": 0.866964328289032, |
| "rewards/tag_count_reward": 0.9046875417232514, |
| "step": 1610 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 811.6839599609375, |
| "epoch": 0.5513356661261415, |
| "grad_norm": 6.485858917236328, |
| "kl": 2.4689453125, |
| "learning_rate": 1.4928492092091e-06, |
| "loss": 0.3277, |
| "reward": 1.9069197118282317, |
| "reward_std": 0.5133912198245525, |
| "rewards/accuracy_reward": 0.1741071511991322, |
| "rewards/format_reward": 0.846428605914116, |
| "rewards/tag_count_reward": 0.8863839745521546, |
| "step": 1615 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 810.3795043945313, |
| "epoch": 0.5530425876930956, |
| "grad_norm": 3.7540383338928223, |
| "kl": 1.98447265625, |
| "learning_rate": 1.4839109682951868e-06, |
| "loss": 0.248, |
| "reward": 1.9069197177886963, |
| "reward_std": 0.5398491598665714, |
| "rewards/accuracy_reward": 0.177678579185158, |
| "rewards/format_reward": 0.8455357551574707, |
| "rewards/tag_count_reward": 0.8837054014205933, |
| "step": 1620 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 801.8080688476563, |
| "epoch": 0.5547495092600495, |
| "grad_norm": 7.626502513885498, |
| "kl": 2.75556640625, |
| "learning_rate": 1.4749732986988233e-06, |
| "loss": 0.2917, |
| "reward": 1.9462054371833801, |
| "reward_std": 0.49239194244146345, |
| "rewards/accuracy_reward": 0.1937500095926225, |
| "rewards/format_reward": 0.8562500357627869, |
| "rewards/tag_count_reward": 0.8962053984403611, |
| "step": 1625 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 810.5491424560547, |
| "epoch": 0.5564564308270035, |
| "grad_norm": 11.224091529846191, |
| "kl": 2.851171875, |
| "learning_rate": 1.4660365177944528e-06, |
| "loss": 0.3272, |
| "reward": 1.950892949104309, |
| "reward_std": 0.5697008088231087, |
| "rewards/accuracy_reward": 0.21250000949949027, |
| "rewards/format_reward": 0.8482143223285675, |
| "rewards/tag_count_reward": 0.8901786178350448, |
| "step": 1630 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 755.3411041259766, |
| "epoch": 0.5581633523939575, |
| "grad_norm": 5.468372344970703, |
| "kl": 2.611328125, |
| "learning_rate": 1.4571009429249621e-06, |
| "loss": 0.2821, |
| "reward": 1.9037947297096252, |
| "reward_std": 0.4984534740447998, |
| "rewards/accuracy_reward": 0.1607142923399806, |
| "rewards/format_reward": 0.8517857551574707, |
| "rewards/tag_count_reward": 0.8912946790456772, |
| "step": 1635 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 756.8464630126953, |
| "epoch": 0.5598702739609115, |
| "grad_norm": 6.450639724731445, |
| "kl": 1.516015625, |
| "learning_rate": 1.448166891390412e-06, |
| "loss": 0.2243, |
| "reward": 1.97678582072258, |
| "reward_std": 0.3970135450363159, |
| "rewards/accuracy_reward": 0.17410715036094188, |
| "rewards/format_reward": 0.8866071790456772, |
| "rewards/tag_count_reward": 0.916071480512619, |
| "step": 1640 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 774.0580657958984, |
| "epoch": 0.5615771955278654, |
| "grad_norm": 8.960746765136719, |
| "kl": 2.1162353515625, |
| "learning_rate": 1.4392346804367697e-06, |
| "loss": 0.2795, |
| "reward": 2.002232217788696, |
| "reward_std": 0.5248230457305908, |
| "rewards/accuracy_reward": 0.2196428682655096, |
| "rewards/format_reward": 0.8714286148548126, |
| "rewards/tag_count_reward": 0.9111607581377029, |
| "step": 1645 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 785.7687835693359, |
| "epoch": 0.5632841170948195, |
| "grad_norm": 3.409419536590576, |
| "kl": 2.0033203125, |
| "learning_rate": 1.4303046272446437e-06, |
| "loss": 0.2474, |
| "reward": 2.006473296880722, |
| "reward_std": 0.477356181293726, |
| "rewards/accuracy_reward": 0.2330357264727354, |
| "rewards/format_reward": 0.8714286118745804, |
| "rewards/tag_count_reward": 0.9020089685916901, |
| "step": 1650 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 799.9205718994141, |
| "epoch": 0.5649910386617735, |
| "grad_norm": 4.918757438659668, |
| "kl": 2.9984375, |
| "learning_rate": 1.4213770489180224e-06, |
| "loss": 0.3257, |
| "reward": 1.9767858028411864, |
| "reward_std": 0.46925376951694486, |
| "rewards/accuracy_reward": 0.20892858104780315, |
| "rewards/format_reward": 0.8669643223285675, |
| "rewards/tag_count_reward": 0.9008928924798966, |
| "step": 1655 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 786.0205749511719, |
| "epoch": 0.5666979602287275, |
| "grad_norm": 7.537283897399902, |
| "kl": 2.5080078125, |
| "learning_rate": 1.4124522624730095e-06, |
| "loss": 0.2612, |
| "reward": 1.9294643819332122, |
| "reward_std": 0.4918681502342224, |
| "rewards/accuracy_reward": 0.19285715091973543, |
| "rewards/format_reward": 0.8500000417232514, |
| "rewards/tag_count_reward": 0.8866071850061417, |
| "step": 1660 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 761.3580657958985, |
| "epoch": 0.5684048817956815, |
| "grad_norm": 5.60185432434082, |
| "kl": 2.038671875, |
| "learning_rate": 1.403530584826573e-06, |
| "loss": 0.2786, |
| "reward": 1.985044741630554, |
| "reward_std": 0.45016813948750495, |
| "rewards/accuracy_reward": 0.19553572395816446, |
| "rewards/format_reward": 0.8785714656114578, |
| "rewards/tag_count_reward": 0.9109375476837158, |
| "step": 1665 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 766.7759307861328, |
| "epoch": 0.5701118033626354, |
| "grad_norm": 14.261216163635254, |
| "kl": 2.019921875, |
| "learning_rate": 1.3946123327852855e-06, |
| "loss": 0.2267, |
| "reward": 2.0183036804199217, |
| "reward_std": 0.4540314465761185, |
| "rewards/accuracy_reward": 0.2089285825379193, |
| "rewards/format_reward": 0.8883929014205932, |
| "rewards/tag_count_reward": 0.9209821850061417, |
| "step": 1670 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 788.6384246826171, |
| "epoch": 0.5718187249295895, |
| "grad_norm": 5.337741851806641, |
| "kl": 2.634375, |
| "learning_rate": 1.3856978230340789e-06, |
| "loss": 0.2992, |
| "reward": 2.0158482909202577, |
| "reward_std": 0.5203843086957931, |
| "rewards/accuracy_reward": 0.21875001154839993, |
| "rewards/format_reward": 0.8812500476837158, |
| "rewards/tag_count_reward": 0.9158482611179352, |
| "step": 1675 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 763.0973571777344, |
| "epoch": 0.5735256464965435, |
| "grad_norm": 3.45206618309021, |
| "kl": 2.1083984375, |
| "learning_rate": 1.3767873721249963e-06, |
| "loss": 0.2189, |
| "reward": 1.9062501072883606, |
| "reward_std": 0.500207568705082, |
| "rewards/accuracy_reward": 0.17053572209551932, |
| "rewards/format_reward": 0.8473214715719223, |
| "rewards/tag_count_reward": 0.8883928954601288, |
| "step": 1680 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 764.3705688476563, |
| "epoch": 0.5752325680634974, |
| "grad_norm": 4.182073593139648, |
| "kl": 1.9408203125, |
| "learning_rate": 1.3678812964659528e-06, |
| "loss": 0.1796, |
| "reward": 1.9564732909202576, |
| "reward_std": 0.5043147563934326, |
| "rewards/accuracy_reward": 0.21696429643779994, |
| "rewards/format_reward": 0.8491071879863739, |
| "rewards/tag_count_reward": 0.890401828289032, |
| "step": 1685 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.931283569336, |
| "epoch": 0.5769394896304515, |
| "grad_norm": 3.476262092590332, |
| "kl": 2.228125, |
| "learning_rate": 1.358979912309499e-06, |
| "loss": 0.1999, |
| "reward": 1.9243304431438446, |
| "reward_std": 0.4896064311265945, |
| "rewards/accuracy_reward": 0.14642857890576125, |
| "rewards/format_reward": 0.8696429014205933, |
| "rewards/tag_count_reward": 0.9082589715719223, |
| "step": 1690 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 736.755387878418, |
| "epoch": 0.5786464111974055, |
| "grad_norm": 3.4234468936920166, |
| "kl": 1.89208984375, |
| "learning_rate": 1.3500835357415933e-06, |
| "loss": 0.1948, |
| "reward": 1.987053632736206, |
| "reward_std": 0.48905017524957656, |
| "rewards/accuracy_reward": 0.20267857918515803, |
| "rewards/format_reward": 0.8732143342494965, |
| "rewards/tag_count_reward": 0.9111607521772385, |
| "step": 1695 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 719.3107406616211, |
| "epoch": 0.5803533327643595, |
| "grad_norm": 2.9903573989868164, |
| "kl": 2.109375, |
| "learning_rate": 1.341192482670372e-06, |
| "loss": 0.1728, |
| "reward": 1.975446528196335, |
| "reward_std": 0.46964697986841203, |
| "rewards/accuracy_reward": 0.20625000912696123, |
| "rewards/format_reward": 0.8678571820259094, |
| "rewards/tag_count_reward": 0.9013393312692642, |
| "step": 1700 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 752.3080657958984, |
| "epoch": 0.5820602543313135, |
| "grad_norm": 9.797762870788574, |
| "kl": 1.139453125, |
| "learning_rate": 1.3323070688149395e-06, |
| "loss": 0.1828, |
| "reward": 2.018750089406967, |
| "reward_std": 0.45740093365311624, |
| "rewards/accuracy_reward": 0.22053572498261928, |
| "rewards/format_reward": 0.8875000387430191, |
| "rewards/tag_count_reward": 0.9107143312692643, |
| "step": 1705 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 727.2919998168945, |
| "epoch": 0.5837671758982674, |
| "grad_norm": 8.485093116760254, |
| "kl": 1.298193359375, |
| "learning_rate": 1.3234276096941503e-06, |
| "loss": 0.1714, |
| "reward": 2.0808036625385284, |
| "reward_std": 0.3648061454296112, |
| "rewards/accuracy_reward": 0.24821429811418055, |
| "rewards/format_reward": 0.9026786178350449, |
| "rewards/tag_count_reward": 0.9299107611179351, |
| "step": 1710 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 705.1152099609375, |
| "epoch": 0.5854740974652215, |
| "grad_norm": 4.864824295043945, |
| "kl": 1.031396484375, |
| "learning_rate": 1.314554420615409e-06, |
| "loss": 0.1304, |
| "reward": 2.053125095367432, |
| "reward_std": 0.32865179777145387, |
| "rewards/accuracy_reward": 0.2026785796508193, |
| "rewards/format_reward": 0.9133928924798965, |
| "rewards/tag_count_reward": 0.9370536088943482, |
| "step": 1715 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 705.3196685791015, |
| "epoch": 0.5871810190321755, |
| "grad_norm": 1.7557798624038696, |
| "kl": 1.1629150390625, |
| "learning_rate": 1.3056878166634721e-06, |
| "loss": 0.1955, |
| "reward": 2.0312501013278963, |
| "reward_std": 0.36564070135355, |
| "rewards/accuracy_reward": 0.17589286509901286, |
| "rewards/format_reward": 0.9151786088943481, |
| "rewards/tag_count_reward": 0.9401786118745804, |
| "step": 1720 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 717.8866394042968, |
| "epoch": 0.5888879405991295, |
| "grad_norm": 2.3789522647857666, |
| "kl": 1.434619140625, |
| "learning_rate": 1.2968281126892603e-06, |
| "loss": 0.189, |
| "reward": 2.010937583446503, |
| "reward_std": 0.32410271763801574, |
| "rewards/accuracy_reward": 0.15446429206058382, |
| "rewards/format_reward": 0.916964316368103, |
| "rewards/tag_count_reward": 0.9395089656114578, |
| "step": 1725 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 764.8107513427734, |
| "epoch": 0.5905948621660835, |
| "grad_norm": 6.4479899406433105, |
| "kl": 1.55869140625, |
| "learning_rate": 1.2879756232986763e-06, |
| "loss": 0.2201, |
| "reward": 2.0031250774860383, |
| "reward_std": 0.44354828745126723, |
| "rewards/accuracy_reward": 0.2142857240512967, |
| "rewards/format_reward": 0.8767857551574707, |
| "rewards/tag_count_reward": 0.9120536148548126, |
| "step": 1730 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.9750305175781, |
| "epoch": 0.5923017837330374, |
| "grad_norm": 1.7079081535339355, |
| "kl": 1.798828125, |
| "learning_rate": 1.2791306628414377e-06, |
| "loss": 0.2179, |
| "reward": 1.9656250894069671, |
| "reward_std": 0.3878778383135796, |
| "rewards/accuracy_reward": 0.15892857862636448, |
| "rewards/format_reward": 0.8883928924798965, |
| "rewards/tag_count_reward": 0.9183036118745804, |
| "step": 1735 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 713.3893127441406, |
| "epoch": 0.5940087052999915, |
| "grad_norm": 1.6621439456939697, |
| "kl": 1.703564453125, |
| "learning_rate": 1.2702935453999079e-06, |
| "loss": 0.1869, |
| "reward": 2.0366072416305543, |
| "reward_std": 0.4229817047715187, |
| "rewards/accuracy_reward": 0.20267858179286122, |
| "rewards/format_reward": 0.9017857640981675, |
| "rewards/tag_count_reward": 0.9321429014205933, |
| "step": 1740 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 728.0705627441406, |
| "epoch": 0.5957156268669455, |
| "grad_norm": 2.5876617431640625, |
| "kl": 1.56240234375, |
| "learning_rate": 1.2614645847779498e-06, |
| "loss": 0.1968, |
| "reward": 1.926562601327896, |
| "reward_std": 0.4078594759106636, |
| "rewards/accuracy_reward": 0.16071429485455155, |
| "rewards/format_reward": 0.8642857521772385, |
| "rewards/tag_count_reward": 0.9015625417232513, |
| "step": 1745 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 736.7232452392578, |
| "epoch": 0.5974225484338994, |
| "grad_norm": 5.281065464019775, |
| "kl": 1.70810546875, |
| "learning_rate": 1.252644094489778e-06, |
| "loss": 0.1785, |
| "reward": 1.942410796880722, |
| "reward_std": 0.44451272040605544, |
| "rewards/accuracy_reward": 0.17678572433069348, |
| "rewards/format_reward": 0.8625000447034836, |
| "rewards/tag_count_reward": 0.9031250447034835, |
| "step": 1750 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 677.8294891357422, |
| "epoch": 0.5991294700008535, |
| "grad_norm": 4.909358501434326, |
| "kl": 1.2296875, |
| "learning_rate": 1.2438323877488274e-06, |
| "loss": 0.1429, |
| "reward": 2.0571429431438446, |
| "reward_std": 0.4347854398190975, |
| "rewards/accuracy_reward": 0.2392857251688838, |
| "rewards/format_reward": 0.8910714685916901, |
| "rewards/tag_count_reward": 0.9267857581377029, |
| "step": 1755 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 740.7134246826172, |
| "epoch": 0.6008363915678074, |
| "grad_norm": 9.942142486572266, |
| "kl": 1.40419921875, |
| "learning_rate": 1.2350297774566337e-06, |
| "loss": 0.1766, |
| "reward": 1.97477685213089, |
| "reward_std": 0.4526204660534859, |
| "rewards/accuracy_reward": 0.19196429187431932, |
| "rewards/format_reward": 0.8750000357627868, |
| "rewards/tag_count_reward": 0.9078125417232513, |
| "step": 1760 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 722.588427734375, |
| "epoch": 0.6025433131347615, |
| "grad_norm": 2.3722033500671387, |
| "kl": 1.3779296875, |
| "learning_rate": 1.2262365761917163e-06, |
| "loss": 0.1971, |
| "reward": 2.035267961025238, |
| "reward_std": 0.4693600654602051, |
| "rewards/accuracy_reward": 0.2125000089406967, |
| "rewards/format_reward": 0.8955357551574707, |
| "rewards/tag_count_reward": 0.9272321820259094, |
| "step": 1765 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.3277038574219, |
| "epoch": 0.6042502347017155, |
| "grad_norm": 4.353895664215088, |
| "kl": 1.6890625, |
| "learning_rate": 1.2174530961984853e-06, |
| "loss": 0.2061, |
| "reward": 1.922321516275406, |
| "reward_std": 0.49082919061183927, |
| "rewards/accuracy_reward": 0.1955357225611806, |
| "rewards/format_reward": 0.8410714715719223, |
| "rewards/tag_count_reward": 0.885714328289032, |
| "step": 1770 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.6419982910156, |
| "epoch": 0.6059571562686694, |
| "grad_norm": 3.145759105682373, |
| "kl": 2.332421875, |
| "learning_rate": 1.2086796493761495e-06, |
| "loss": 0.2836, |
| "reward": 1.865178644657135, |
| "reward_std": 0.5404426328837871, |
| "rewards/accuracy_reward": 0.16428572041913866, |
| "rewards/format_reward": 0.8258929014205932, |
| "rewards/tag_count_reward": 0.8750000387430191, |
| "step": 1775 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 696.7169952392578, |
| "epoch": 0.6076640778356235, |
| "grad_norm": 1.4645618200302124, |
| "kl": 1.76787109375, |
| "learning_rate": 1.1999165472676426e-06, |
| "loss": 0.1652, |
| "reward": 1.9060268819332122, |
| "reward_std": 0.5109671518206597, |
| "rewards/accuracy_reward": 0.18392857955768704, |
| "rewards/format_reward": 0.8392857491970063, |
| "rewards/tag_count_reward": 0.8828125417232513, |
| "step": 1780 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 690.5178894042969, |
| "epoch": 0.6093709994025774, |
| "grad_norm": 1.7024650573730469, |
| "kl": 1.23330078125, |
| "learning_rate": 1.1911641010485598e-06, |
| "loss": 0.1556, |
| "reward": 2.016741174459457, |
| "reward_std": 0.43561617806553843, |
| "rewards/accuracy_reward": 0.22678572572767736, |
| "rewards/format_reward": 0.8758928954601288, |
| "rewards/tag_count_reward": 0.9140625476837159, |
| "step": 1785 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 721.9571762084961, |
| "epoch": 0.6110779209695314, |
| "grad_norm": 1.3666507005691528, |
| "kl": 1.13935546875, |
| "learning_rate": 1.182422621516109e-06, |
| "loss": 0.1252, |
| "reward": 1.9524554371833802, |
| "reward_std": 0.398265440762043, |
| "rewards/accuracy_reward": 0.15625000530853866, |
| "rewards/format_reward": 0.8839286118745804, |
| "rewards/tag_count_reward": 0.9122768253087997, |
| "step": 1790 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 684.8982421875, |
| "epoch": 0.6127848425364855, |
| "grad_norm": 2.1875545978546143, |
| "kl": 1.0215087890625, |
| "learning_rate": 1.1736924190780725e-06, |
| "loss": 0.0728, |
| "reward": 2.037276875972748, |
| "reward_std": 0.3939563654363155, |
| "rewards/accuracy_reward": 0.19464286416769028, |
| "rewards/format_reward": 0.9089286148548126, |
| "rewards/tag_count_reward": 0.933705398440361, |
| "step": 1795 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 758.3491455078125, |
| "epoch": 0.6144917641034394, |
| "grad_norm": 3.1847763061523438, |
| "kl": 1.2671875, |
| "learning_rate": 1.1649738037417878e-06, |
| "loss": 0.1705, |
| "reward": 1.9837054431438446, |
| "reward_std": 0.40774648189544677, |
| "rewards/accuracy_reward": 0.16517857890576124, |
| "rewards/format_reward": 0.8955357521772385, |
| "rewards/tag_count_reward": 0.9229911148548127, |
| "step": 1800 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 709.7312835693359, |
| "epoch": 0.6161986856703935, |
| "grad_norm": 1.9738051891326904, |
| "kl": 1.80869140625, |
| "learning_rate": 1.1562670851031345e-06, |
| "loss": 0.2007, |
| "reward": 1.983035796880722, |
| "reward_std": 0.4977798670530319, |
| "rewards/accuracy_reward": 0.21428572218865155, |
| "rewards/format_reward": 0.8633928954601288, |
| "rewards/tag_count_reward": 0.9053571909666062, |
| "step": 1805 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 755.1071746826171, |
| "epoch": 0.6179056072373474, |
| "grad_norm": 7.7020673751831055, |
| "kl": 1.845703125, |
| "learning_rate": 1.1475725723355462e-06, |
| "loss": 0.1874, |
| "reward": 1.864955449104309, |
| "reward_std": 0.5472064293920994, |
| "rewards/accuracy_reward": 0.17678572256118058, |
| "rewards/format_reward": 0.8205357551574707, |
| "rewards/tag_count_reward": 0.8676339685916901, |
| "step": 1810 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 727.8607482910156, |
| "epoch": 0.6196125288043014, |
| "grad_norm": 2.8931884765625, |
| "kl": 2.0552734375, |
| "learning_rate": 1.1388905741790269e-06, |
| "loss": 0.2344, |
| "reward": 1.922767949104309, |
| "reward_std": 0.4940613195300102, |
| "rewards/accuracy_reward": 0.15089286286383868, |
| "rewards/format_reward": 0.8651786088943482, |
| "rewards/tag_count_reward": 0.9066964745521545, |
| "step": 1815 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.1125366210938, |
| "epoch": 0.6213194503712555, |
| "grad_norm": 1.5861130952835083, |
| "kl": 1.46142578125, |
| "learning_rate": 1.1302213989291914e-06, |
| "loss": 0.2015, |
| "reward": 1.9593750953674316, |
| "reward_std": 0.4679669015109539, |
| "rewards/accuracy_reward": 0.1785714362747967, |
| "rewards/format_reward": 0.8732143253087997, |
| "rewards/tag_count_reward": 0.9075893282890319, |
| "step": 1820 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 763.0911102294922, |
| "epoch": 0.6230263719382094, |
| "grad_norm": 2.04757022857666, |
| "kl": 1.56689453125, |
| "learning_rate": 1.1215653544263147e-06, |
| "loss": 0.1845, |
| "reward": 2.006250095367432, |
| "reward_std": 0.46510125435888766, |
| "rewards/accuracy_reward": 0.21250000968575478, |
| "rewards/format_reward": 0.8776786118745804, |
| "rewards/tag_count_reward": 0.9160714685916901, |
| "step": 1825 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 743.4795013427735, |
| "epoch": 0.6247332935051635, |
| "grad_norm": 3.3067221641540527, |
| "kl": 1.68193359375, |
| "learning_rate": 1.1129227480444041e-06, |
| "loss": 0.1842, |
| "reward": 1.9939732909202577, |
| "reward_std": 0.4414864867925644, |
| "rewards/accuracy_reward": 0.20446429373696445, |
| "rewards/format_reward": 0.8767857551574707, |
| "rewards/tag_count_reward": 0.9127232581377029, |
| "step": 1830 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.1839599609375, |
| "epoch": 0.6264402150721174, |
| "grad_norm": 3.4389750957489014, |
| "kl": 1.82890625, |
| "learning_rate": 1.10429388668028e-06, |
| "loss": 0.2676, |
| "reward": 2.0104911625385284, |
| "reward_std": 0.5313868165016175, |
| "rewards/accuracy_reward": 0.22589287031441926, |
| "rewards/format_reward": 0.8705357581377029, |
| "rewards/tag_count_reward": 0.9140625417232513, |
| "step": 1835 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 746.4893157958984, |
| "epoch": 0.6281471366390714, |
| "grad_norm": 2.3858678340911865, |
| "kl": 1.932421875, |
| "learning_rate": 1.0956790767426834e-06, |
| "loss": 0.2194, |
| "reward": 1.9825893580913543, |
| "reward_std": 0.5101016968488693, |
| "rewards/accuracy_reward": 0.2044642936438322, |
| "rewards/format_reward": 0.8723214715719223, |
| "rewards/tag_count_reward": 0.9058036118745804, |
| "step": 1840 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 746.9839630126953, |
| "epoch": 0.6298540582060255, |
| "grad_norm": 3.954383134841919, |
| "kl": 2.05859375, |
| "learning_rate": 1.0870786241413909e-06, |
| "loss": 0.2344, |
| "reward": 1.9395090341567993, |
| "reward_std": 0.5304494857788086, |
| "rewards/accuracy_reward": 0.2142857256345451, |
| "rewards/format_reward": 0.8401786118745804, |
| "rewards/tag_count_reward": 0.8850446850061416, |
| "step": 1845 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 755.4393157958984, |
| "epoch": 0.6315609797729794, |
| "grad_norm": 6.762625694274902, |
| "kl": 1.98203125, |
| "learning_rate": 1.078492834276354e-06, |
| "loss": 0.2525, |
| "reward": 1.8654018759727478, |
| "reward_std": 0.5434098705649376, |
| "rewards/accuracy_reward": 0.15803572097793223, |
| "rewards/format_reward": 0.8303571879863739, |
| "rewards/tag_count_reward": 0.8770089685916901, |
| "step": 1850 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 788.8839660644531, |
| "epoch": 0.6332679013399334, |
| "grad_norm": 2.6459848880767822, |
| "kl": 2.58125, |
| "learning_rate": 1.069922012026854e-06, |
| "loss": 0.2477, |
| "reward": 1.8825893640518188, |
| "reward_std": 0.6637891173362732, |
| "rewards/accuracy_reward": 0.19910715203732252, |
| "rewards/format_reward": 0.8151786118745804, |
| "rewards/tag_count_reward": 0.8683036148548127, |
| "step": 1855 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 723.5955688476563, |
| "epoch": 0.6349748229068874, |
| "grad_norm": 2.0201399326324463, |
| "kl": 1.7146484375, |
| "learning_rate": 1.0613664617406762e-06, |
| "loss": 0.1506, |
| "reward": 1.9308036625385285, |
| "reward_std": 0.5138662807643414, |
| "rewards/accuracy_reward": 0.17678572088479996, |
| "rewards/format_reward": 0.8562500417232514, |
| "rewards/tag_count_reward": 0.8977679014205933, |
| "step": 1860 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 743.973257446289, |
| "epoch": 0.6366817444738414, |
| "grad_norm": 2.8136751651763916, |
| "kl": 1.8068359375, |
| "learning_rate": 1.0528264872233018e-06, |
| "loss": 0.1668, |
| "reward": 1.8919643580913543, |
| "reward_std": 0.5319583684206008, |
| "rewards/accuracy_reward": 0.18839286705479025, |
| "rewards/format_reward": 0.8267857581377029, |
| "rewards/tag_count_reward": 0.8767857551574707, |
| "step": 1865 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 695.7518188476563, |
| "epoch": 0.6383886660407955, |
| "grad_norm": 2.7147600650787354, |
| "kl": 2.24091796875, |
| "learning_rate": 1.0443023917271202e-06, |
| "loss": 0.1887, |
| "reward": 1.868303656578064, |
| "reward_std": 0.5464206546545028, |
| "rewards/accuracy_reward": 0.20178572367876768, |
| "rewards/format_reward": 0.8080357521772384, |
| "rewards/tag_count_reward": 0.8584821790456771, |
| "step": 1870 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 718.8482482910156, |
| "epoch": 0.6400955876077494, |
| "grad_norm": 3.581350326538086, |
| "kl": 1.817578125, |
| "learning_rate": 1.0357944779406609e-06, |
| "loss": 0.1201, |
| "reward": 1.9125000953674316, |
| "reward_std": 0.5139884263277054, |
| "rewards/accuracy_reward": 0.19821429224684833, |
| "rewards/format_reward": 0.8348214745521545, |
| "rewards/tag_count_reward": 0.8794643253087997, |
| "step": 1875 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 686.5134216308594, |
| "epoch": 0.6418025091747034, |
| "grad_norm": 1.0345350503921509, |
| "kl": 1.70791015625, |
| "learning_rate": 1.0273030479778456e-06, |
| "loss": 0.1892, |
| "reward": 1.9660715281963348, |
| "reward_std": 0.45822909846901894, |
| "rewards/accuracy_reward": 0.1767857238650322, |
| "rewards/format_reward": 0.877678605914116, |
| "rewards/tag_count_reward": 0.9116071879863739, |
| "step": 1880 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.682177734375, |
| "epoch": 0.6435094307416575, |
| "grad_norm": 0.9296639561653137, |
| "kl": 1.1458984375, |
| "learning_rate": 1.0188284033672586e-06, |
| "loss": 0.1595, |
| "reward": 1.9872768700122834, |
| "reward_std": 0.40355037674307825, |
| "rewards/accuracy_reward": 0.15714286332949995, |
| "rewards/format_reward": 0.9053571850061417, |
| "rewards/tag_count_reward": 0.924776828289032, |
| "step": 1885 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.3928894042969, |
| "epoch": 0.6452163523086114, |
| "grad_norm": 5.478949069976807, |
| "kl": 0.98974609375, |
| "learning_rate": 1.0103708450414404e-06, |
| "loss": 0.079, |
| "reward": 2.0517858028411866, |
| "reward_std": 0.419903215020895, |
| "rewards/accuracy_reward": 0.2044642929919064, |
| "rewards/format_reward": 0.9125000387430191, |
| "rewards/tag_count_reward": 0.93482146859169, |
| "step": 1890 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 710.156283569336, |
| "epoch": 0.6469232738755654, |
| "grad_norm": 2.714566469192505, |
| "kl": 1.3677734375, |
| "learning_rate": 1.0019306733262022e-06, |
| "loss": 0.1255, |
| "reward": 1.9647322475910187, |
| "reward_std": 0.40584927052259445, |
| "rewards/accuracy_reward": 0.16160715082660318, |
| "rewards/format_reward": 0.8901786118745804, |
| "rewards/tag_count_reward": 0.9129464715719223, |
| "step": 1895 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 703.6277053833007, |
| "epoch": 0.6486301954425194, |
| "grad_norm": 1.2529172897338867, |
| "kl": 1.42275390625, |
| "learning_rate": 9.935081879299605e-07, |
| "loss": 0.1632, |
| "reward": 1.9410715401172638, |
| "reward_std": 0.4627831496298313, |
| "rewards/accuracy_reward": 0.1928571521304548, |
| "rewards/format_reward": 0.8544643193483352, |
| "rewards/tag_count_reward": 0.8937500357627869, |
| "step": 1900 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.7062866210938, |
| "epoch": 0.6503371170094734, |
| "grad_norm": 2.3366475105285645, |
| "kl": 1.8337890625, |
| "learning_rate": 9.851036879330958e-07, |
| "loss": 0.1861, |
| "reward": 1.936830472946167, |
| "reward_std": 0.5271413020789624, |
| "rewards/accuracy_reward": 0.18035714915022255, |
| "rewards/format_reward": 0.8553571850061417, |
| "rewards/tag_count_reward": 0.9011161088943481, |
| "step": 1905 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 735.8169952392578, |
| "epoch": 0.6520440385764275, |
| "grad_norm": 2.5596821308135986, |
| "kl": 1.5796875, |
| "learning_rate": 9.767174717773307e-07, |
| "loss": 0.1504, |
| "reward": 1.9691965103149414, |
| "reward_std": 0.4714687965810299, |
| "rewards/accuracy_reward": 0.1839285807684064, |
| "rewards/format_reward": 0.8750000447034836, |
| "rewards/tag_count_reward": 0.9102679014205932, |
| "step": 1910 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 727.2643218994141, |
| "epoch": 0.6537509601433814, |
| "grad_norm": 0.8387001156806946, |
| "kl": 1.6423828125, |
| "learning_rate": 9.683498372551335e-07, |
| "loss": 0.1925, |
| "reward": 1.9691965103149414, |
| "reward_std": 0.4812636740505695, |
| "rewards/accuracy_reward": 0.19553572311997414, |
| "rewards/format_reward": 0.8669643342494965, |
| "rewards/tag_count_reward": 0.9066964685916901, |
| "step": 1915 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 734.2857482910156, |
| "epoch": 0.6554578817103354, |
| "grad_norm": 0.9733322262763977, |
| "kl": 1.50322265625, |
| "learning_rate": 9.600010814991425e-07, |
| "loss": 0.1831, |
| "reward": 1.9265625894069671, |
| "reward_std": 0.4761272206902504, |
| "rewards/accuracy_reward": 0.16339286472648382, |
| "rewards/format_reward": 0.8625000387430191, |
| "rewards/tag_count_reward": 0.9006696850061416, |
| "step": 1920 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 743.7411041259766, |
| "epoch": 0.6571648032772894, |
| "grad_norm": 1.7600935697555542, |
| "kl": 1.1025390625, |
| "learning_rate": 9.51671500971617e-07, |
| "loss": 0.1222, |
| "reward": 1.9857143700122832, |
| "reward_std": 0.41541323736310004, |
| "rewards/accuracy_reward": 0.17946429513394832, |
| "rewards/format_reward": 0.8883929014205932, |
| "rewards/tag_count_reward": 0.9178571850061417, |
| "step": 1925 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 734.8607513427735, |
| "epoch": 0.6588717248442434, |
| "grad_norm": 1.9581345319747925, |
| "kl": 1.437109375, |
| "learning_rate": 9.433613914539076e-07, |
| "loss": 0.1572, |
| "reward": 1.9459822356700898, |
| "reward_std": 0.4454167552292347, |
| "rewards/accuracy_reward": 0.1678571510128677, |
| "rewards/format_reward": 0.873214328289032, |
| "rewards/tag_count_reward": 0.9049107521772385, |
| "step": 1930 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 690.2170013427734, |
| "epoch": 0.6605786464111975, |
| "grad_norm": 4.418382167816162, |
| "kl": 1.1951171875, |
| "learning_rate": 9.350710480359549e-07, |
| "loss": 0.1354, |
| "reward": 2.0120536625385284, |
| "reward_std": 0.39660380184650423, |
| "rewards/accuracy_reward": 0.18303572237491608, |
| "rewards/format_reward": 0.9000000387430191, |
| "rewards/tag_count_reward": 0.92901791036129, |
| "step": 1935 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 697.366098022461, |
| "epoch": 0.6622855679781514, |
| "grad_norm": 1.5262176990509033, |
| "kl": 1.357177734375, |
| "learning_rate": 9.268007651058089e-07, |
| "loss": 0.129, |
| "reward": 2.02790185213089, |
| "reward_std": 0.4235570203512907, |
| "rewards/accuracy_reward": 0.22767858300358057, |
| "rewards/format_reward": 0.8857143253087998, |
| "rewards/tag_count_reward": 0.9145089715719223, |
| "step": 1940 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 733.0839538574219, |
| "epoch": 0.6639924895451054, |
| "grad_norm": 1.9810816049575806, |
| "kl": 1.683984375, |
| "learning_rate": 9.185508363391787e-07, |
| "loss": 0.1943, |
| "reward": 1.9560268938541412, |
| "reward_std": 0.48697994872927663, |
| "rewards/accuracy_reward": 0.1714285804890096, |
| "rewards/format_reward": 0.8723214715719223, |
| "rewards/tag_count_reward": 0.912276828289032, |
| "step": 1945 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.0661041259766, |
| "epoch": 0.6656994111120594, |
| "grad_norm": 2.6021721363067627, |
| "kl": 1.9115234375, |
| "learning_rate": 9.103215546890001e-07, |
| "loss": 0.1917, |
| "reward": 1.939285784959793, |
| "reward_std": 0.5354632467031479, |
| "rewards/accuracy_reward": 0.18482143925502897, |
| "rewards/format_reward": 0.8562500417232514, |
| "rewards/tag_count_reward": 0.8982143223285675, |
| "step": 1950 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.5402069091797, |
| "epoch": 0.6674063326790134, |
| "grad_norm": 5.726203918457031, |
| "kl": 1.6966796875, |
| "learning_rate": 9.021132123750361e-07, |
| "loss": 0.1782, |
| "reward": 1.943303656578064, |
| "reward_std": 0.4957593522965908, |
| "rewards/accuracy_reward": 0.1562500072643161, |
| "rewards/format_reward": 0.8767857551574707, |
| "rewards/tag_count_reward": 0.9102679014205932, |
| "step": 1955 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 738.5419921875, |
| "epoch": 0.6691132542459673, |
| "grad_norm": 3.667335271835327, |
| "kl": 1.2876220703125, |
| "learning_rate": 8.93926100873498e-07, |
| "loss": 0.1574, |
| "reward": 2.027678656578064, |
| "reward_std": 0.41424002312123775, |
| "rewards/accuracy_reward": 0.18660715240985154, |
| "rewards/format_reward": 0.9053571850061417, |
| "rewards/tag_count_reward": 0.9357143193483353, |
| "step": 1960 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 715.632177734375, |
| "epoch": 0.6708201758129214, |
| "grad_norm": 5.630631923675537, |
| "kl": 1.3962890625, |
| "learning_rate": 8.857605109066977e-07, |
| "loss": 0.1684, |
| "reward": 2.0866072475910187, |
| "reward_std": 0.4239813320338726, |
| "rewards/accuracy_reward": 0.23660715185105802, |
| "rewards/format_reward": 0.9151786088943481, |
| "rewards/tag_count_reward": 0.9348214745521546, |
| "step": 1965 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 724.7134246826172, |
| "epoch": 0.6725270973798754, |
| "grad_norm": 0.8905249834060669, |
| "kl": 1.224609375, |
| "learning_rate": 8.776167324327203e-07, |
| "loss": 0.1153, |
| "reward": 2.027678686380386, |
| "reward_std": 0.3887888576835394, |
| "rewards/accuracy_reward": 0.16875000894069672, |
| "rewards/format_reward": 0.9169643223285675, |
| "rewards/tag_count_reward": 0.9419643253087997, |
| "step": 1970 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 782.5875366210937, |
| "epoch": 0.6742340189468294, |
| "grad_norm": 2.29498291015625, |
| "kl": 1.78818359375, |
| "learning_rate": 8.694950546351335e-07, |
| "loss": 0.2179, |
| "reward": 1.9397322356700897, |
| "reward_std": 0.43477702885866165, |
| "rewards/accuracy_reward": 0.14375000745058059, |
| "rewards/format_reward": 0.8812500387430191, |
| "rewards/tag_count_reward": 0.9147321850061416, |
| "step": 1975 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.3321807861328, |
| "epoch": 0.6759409405137834, |
| "grad_norm": 1.6042215824127197, |
| "kl": 1.51044921875, |
| "learning_rate": 8.61395765912712e-07, |
| "loss": 0.1558, |
| "reward": 2.017857217788696, |
| "reward_std": 0.41501733139157293, |
| "rewards/accuracy_reward": 0.18750000819563867, |
| "rewards/format_reward": 0.9035714656114578, |
| "rewards/tag_count_reward": 0.9267857581377029, |
| "step": 1980 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 749.9616394042969, |
| "epoch": 0.6776478620807374, |
| "grad_norm": 6.817398548126221, |
| "kl": 1.70859375, |
| "learning_rate": 8.533191538692026e-07, |
| "loss": 0.199, |
| "reward": 2.0314733028411864, |
| "reward_std": 0.4689814649522305, |
| "rewards/accuracy_reward": 0.2258928676135838, |
| "rewards/format_reward": 0.8875000447034835, |
| "rewards/tag_count_reward": 0.9180804014205932, |
| "step": 1985 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 775.2875335693359, |
| "epoch": 0.6793547836476914, |
| "grad_norm": 5.016016006469727, |
| "kl": 1.9013671875, |
| "learning_rate": 8.452655053031066e-07, |
| "loss": 0.2043, |
| "reward": 1.953571516275406, |
| "reward_std": 0.5462075427174569, |
| "rewards/accuracy_reward": 0.19375000931322575, |
| "rewards/format_reward": 0.8625000476837158, |
| "rewards/tag_count_reward": 0.89732146859169, |
| "step": 1990 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.9411102294922, |
| "epoch": 0.6810617052146454, |
| "grad_norm": 1.6670773029327393, |
| "kl": 2.117578125, |
| "learning_rate": 8.372351061975014e-07, |
| "loss": 0.2836, |
| "reward": 1.9348215281963348, |
| "reward_std": 0.5541624218225479, |
| "rewards/accuracy_reward": 0.2035714391618967, |
| "rewards/format_reward": 0.8491071790456772, |
| "rewards/tag_count_reward": 0.8821429014205933, |
| "step": 1995 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 807.7107513427734, |
| "epoch": 0.6827686267815993, |
| "grad_norm": 1.560651183128357, |
| "kl": 1.6337890625, |
| "learning_rate": 8.292282417098763e-07, |
| "loss": 0.2252, |
| "reward": 1.9933036804199218, |
| "reward_std": 0.47950020134449006, |
| "rewards/accuracy_reward": 0.20089286658912897, |
| "rewards/format_reward": 0.8812500447034836, |
| "rewards/tag_count_reward": 0.9111607640981674, |
| "step": 2000 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 756.7250396728516, |
| "epoch": 0.6844755483485534, |
| "grad_norm": 4.225320816040039, |
| "kl": 1.6544921875, |
| "learning_rate": 8.212451961620176e-07, |
| "loss": 0.2091, |
| "reward": 1.9883929371833802, |
| "reward_std": 0.4503480665385723, |
| "rewards/accuracy_reward": 0.19910714793950318, |
| "rewards/format_reward": 0.8785714775323867, |
| "rewards/tag_count_reward": 0.9107143253087997, |
| "step": 2005 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 753.9098541259766, |
| "epoch": 0.6861824699155074, |
| "grad_norm": 1.1238821744918823, |
| "kl": 1.6462890625, |
| "learning_rate": 8.132862530299031e-07, |
| "loss": 0.2113, |
| "reward": 1.9857143700122832, |
| "reward_std": 0.5074703797698021, |
| "rewards/accuracy_reward": 0.21875000707805156, |
| "rewards/format_reward": 0.8669643253087997, |
| "rewards/tag_count_reward": 0.9000000417232513, |
| "step": 2010 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.651821899414, |
| "epoch": 0.6878893914824614, |
| "grad_norm": 5.102161407470703, |
| "kl": 2.066015625, |
| "learning_rate": 8.053516949336425e-07, |
| "loss": 0.2291, |
| "reward": 1.9631697356700897, |
| "reward_std": 0.4666756376624107, |
| "rewards/accuracy_reward": 0.18392858169972898, |
| "rewards/format_reward": 0.875892898440361, |
| "rewards/tag_count_reward": 0.9033482551574707, |
| "step": 2015 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.7232482910156, |
| "epoch": 0.6895963130494154, |
| "grad_norm": 4.221590042114258, |
| "kl": 1.782421875, |
| "learning_rate": 7.974418036274371e-07, |
| "loss": 0.1794, |
| "reward": 1.9631697356700897, |
| "reward_std": 0.5094508707523346, |
| "rewards/accuracy_reward": 0.19821429466828705, |
| "rewards/format_reward": 0.8633928954601288, |
| "rewards/tag_count_reward": 0.9015625447034836, |
| "step": 2020 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.8018249511719, |
| "epoch": 0.6913032346163693, |
| "grad_norm": 2.238372802734375, |
| "kl": 1.6064453125, |
| "learning_rate": 7.895568599895763e-07, |
| "loss": 0.1517, |
| "reward": 2.003348296880722, |
| "reward_std": 0.4860348865389824, |
| "rewards/accuracy_reward": 0.20178572349250318, |
| "rewards/format_reward": 0.882142898440361, |
| "rewards/tag_count_reward": 0.9194196820259094, |
| "step": 2025 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.3482482910156, |
| "epoch": 0.6930101561833234, |
| "grad_norm": 2.6449360847473145, |
| "kl": 1.395166015625, |
| "learning_rate": 7.816971440124661e-07, |
| "loss": 0.1546, |
| "reward": 2.007142949104309, |
| "reward_std": 0.4705513596534729, |
| "rewards/accuracy_reward": 0.2071428656578064, |
| "rewards/format_reward": 0.8848214685916901, |
| "rewards/tag_count_reward": 0.9151786148548127, |
| "step": 2030 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.1062850952148, |
| "epoch": 0.6947170777502774, |
| "grad_norm": 0.6730054020881653, |
| "kl": 1.17783203125, |
| "learning_rate": 7.738629347926818e-07, |
| "loss": 0.1375, |
| "reward": 2.0524554371833803, |
| "reward_std": 0.4183092713356018, |
| "rewards/accuracy_reward": 0.20446429429575802, |
| "rewards/format_reward": 0.9133928954601288, |
| "rewards/tag_count_reward": 0.9345982581377029, |
| "step": 2035 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 796.7393218994141, |
| "epoch": 0.6964239993172314, |
| "grad_norm": 0.5611494779586792, |
| "kl": 1.42998046875, |
| "learning_rate": 7.660545105210627e-07, |
| "loss": 0.1679, |
| "reward": 2.0196429550647736, |
| "reward_std": 0.4546675443649292, |
| "rewards/accuracy_reward": 0.20982143972069026, |
| "rewards/format_reward": 0.8875000417232514, |
| "rewards/tag_count_reward": 0.9223214745521545, |
| "step": 2040 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 772.7375335693359, |
| "epoch": 0.6981309208841854, |
| "grad_norm": 1.859115719795227, |
| "kl": 1.3640625, |
| "learning_rate": 7.582721484728289e-07, |
| "loss": 0.22, |
| "reward": 1.988392949104309, |
| "reward_std": 0.39518385380506516, |
| "rewards/accuracy_reward": 0.1821428638882935, |
| "rewards/format_reward": 0.8857143223285675, |
| "rewards/tag_count_reward": 0.9205357611179352, |
| "step": 2045 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.6500366210937, |
| "epoch": 0.6998378424511393, |
| "grad_norm": 6.75427770614624, |
| "kl": 1.10439453125, |
| "learning_rate": 7.50516124997738e-07, |
| "loss": 0.1238, |
| "reward": 1.9763393878936768, |
| "reward_std": 0.39795525595545767, |
| "rewards/accuracy_reward": 0.16071429178118707, |
| "rewards/format_reward": 0.8919643223285675, |
| "rewards/tag_count_reward": 0.9236607491970062, |
| "step": 2050 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.8535980224609, |
| "epoch": 0.7015447640180934, |
| "grad_norm": 1.4931570291519165, |
| "kl": 1.0126953125, |
| "learning_rate": 7.427867155102712e-07, |
| "loss": 0.133, |
| "reward": 2.013169747591019, |
| "reward_std": 0.380087611079216, |
| "rewards/accuracy_reward": 0.1669642947614193, |
| "rewards/format_reward": 0.9080357491970062, |
| "rewards/tag_count_reward": 0.9381696850061416, |
| "step": 2055 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 733.42861328125, |
| "epoch": 0.7032516855850474, |
| "grad_norm": 5.1911211013793945, |
| "kl": 1.214453125, |
| "learning_rate": 7.350841944798547e-07, |
| "loss": 0.16, |
| "reward": 1.9904018700122834, |
| "reward_std": 0.4548638232052326, |
| "rewards/accuracy_reward": 0.2062500131316483, |
| "rewards/format_reward": 0.8714286029338837, |
| "rewards/tag_count_reward": 0.9127232551574707, |
| "step": 2060 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 751.2232513427734, |
| "epoch": 0.7049586071520013, |
| "grad_norm": 1.5467967987060547, |
| "kl": 1.441796875, |
| "learning_rate": 7.2740883542111e-07, |
| "loss": 0.1354, |
| "reward": 1.958035796880722, |
| "reward_std": 0.5087195709347725, |
| "rewards/accuracy_reward": 0.21071429485455156, |
| "rewards/format_reward": 0.851785758137703, |
| "rewards/tag_count_reward": 0.8955357581377029, |
| "step": 2065 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 778.2982543945312, |
| "epoch": 0.7066655287189554, |
| "grad_norm": 4.287073612213135, |
| "kl": 1.624609375, |
| "learning_rate": 7.197609108841465e-07, |
| "loss": 0.137, |
| "reward": 1.9265625834465028, |
| "reward_std": 0.5439430341124535, |
| "rewards/accuracy_reward": 0.18303572461009027, |
| "rewards/format_reward": 0.8419643342494965, |
| "rewards/tag_count_reward": 0.9015625417232513, |
| "step": 2070 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.2491424560546, |
| "epoch": 0.7083724502859093, |
| "grad_norm": 1.5213736295700073, |
| "kl": 1.6482421875, |
| "learning_rate": 7.121406924448783e-07, |
| "loss": 0.1766, |
| "reward": 1.944196528196335, |
| "reward_std": 0.5264617592096329, |
| "rewards/accuracy_reward": 0.18392858002334833, |
| "rewards/format_reward": 0.8553571790456772, |
| "rewards/tag_count_reward": 0.9049107551574707, |
| "step": 2075 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 733.4545013427735, |
| "epoch": 0.7100793718528634, |
| "grad_norm": 1.2759286165237427, |
| "kl": 1.5237548828125, |
| "learning_rate": 7.045484506953832e-07, |
| "loss": 0.1214, |
| "reward": 1.9647322237491607, |
| "reward_std": 0.4902063623070717, |
| "rewards/accuracy_reward": 0.20535715045407416, |
| "rewards/format_reward": 0.8589286088943482, |
| "rewards/tag_count_reward": 0.9004464775323868, |
| "step": 2080 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 730.1562896728516, |
| "epoch": 0.7117862934198174, |
| "grad_norm": 4.01860237121582, |
| "kl": 1.3330078125, |
| "learning_rate": 6.969844552342939e-07, |
| "loss": 0.1437, |
| "reward": 1.993303656578064, |
| "reward_std": 0.42157181948423383, |
| "rewards/accuracy_reward": 0.1848214372061193, |
| "rewards/format_reward": 0.8848214656114578, |
| "rewards/tag_count_reward": 0.923660758137703, |
| "step": 2085 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 728.0446838378906, |
| "epoch": 0.7134932149867713, |
| "grad_norm": 1.2710025310516357, |
| "kl": 1.3125, |
| "learning_rate": 6.894489746572252e-07, |
| "loss": 0.1311, |
| "reward": 1.9805804371833802, |
| "reward_std": 0.44029773622751234, |
| "rewards/accuracy_reward": 0.16696429029107093, |
| "rewards/format_reward": 0.8910714685916901, |
| "rewards/tag_count_reward": 0.9225446879863739, |
| "step": 2090 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 714.4286010742187, |
| "epoch": 0.7152001365537254, |
| "grad_norm": 2.4798624515533447, |
| "kl": 1.593359375, |
| "learning_rate": 6.819422765472337e-07, |
| "loss": 0.2047, |
| "reward": 2.0042411744594575, |
| "reward_std": 0.46089204400777817, |
| "rewards/accuracy_reward": 0.2044642947614193, |
| "rewards/format_reward": 0.8839286178350448, |
| "rewards/tag_count_reward": 0.9158482521772384, |
| "step": 2095 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 691.5536010742187, |
| "epoch": 0.7169070581206793, |
| "grad_norm": 2.2551651000976562, |
| "kl": 1.31337890625, |
| "learning_rate": 6.744646274653198e-07, |
| "loss": 0.1504, |
| "reward": 2.024553656578064, |
| "reward_std": 0.4200122021138668, |
| "rewards/accuracy_reward": 0.23571429625153542, |
| "rewards/format_reward": 0.8785714685916901, |
| "rewards/tag_count_reward": 0.9102678954601288, |
| "step": 2100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 737.5652160644531, |
| "epoch": 0.7186139796876334, |
| "grad_norm": 1.9618374109268188, |
| "kl": 1.533203125, |
| "learning_rate": 6.670162929409572e-07, |
| "loss": 0.1547, |
| "reward": 1.949330461025238, |
| "reward_std": 0.47244517505168915, |
| "rewards/accuracy_reward": 0.17321429355069995, |
| "rewards/format_reward": 0.8625000447034836, |
| "rewards/tag_count_reward": 0.9136161088943482, |
| "step": 2105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 735.7500244140625, |
| "epoch": 0.7203209012545874, |
| "grad_norm": 3.131383180618286, |
| "kl": 1.41767578125, |
| "learning_rate": 6.595975374626699e-07, |
| "loss": 0.182, |
| "reward": 1.9044643700122834, |
| "reward_std": 0.48238158598542213, |
| "rewards/accuracy_reward": 0.13839286249130964, |
| "rewards/format_reward": 0.8616071850061416, |
| "rewards/tag_count_reward": 0.9044643253087997, |
| "step": 2110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 700.0759216308594, |
| "epoch": 0.7220278228215413, |
| "grad_norm": 1.9941433668136597, |
| "kl": 1.3751953125, |
| "learning_rate": 6.522086244686351e-07, |
| "loss": 0.1702, |
| "reward": 2.022098296880722, |
| "reward_std": 0.42175976261496545, |
| "rewards/accuracy_reward": 0.2017857232131064, |
| "rewards/format_reward": 0.8910714715719223, |
| "rewards/tag_count_reward": 0.9292411118745804, |
| "step": 2115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 732.7411071777344, |
| "epoch": 0.7237347443884954, |
| "grad_norm": 1.4535045623779297, |
| "kl": 1.6833984375, |
| "learning_rate": 6.448498163373324e-07, |
| "loss": 0.2005, |
| "reward": 1.98214293718338, |
| "reward_std": 0.4729441896080971, |
| "rewards/accuracy_reward": 0.19821429708972574, |
| "rewards/format_reward": 0.8723214745521546, |
| "rewards/tag_count_reward": 0.9116071820259094, |
| "step": 2120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 744.5536071777344, |
| "epoch": 0.7254416659554493, |
| "grad_norm": 5.207380294799805, |
| "kl": 1.4818359375, |
| "learning_rate": 6.375213743782236e-07, |
| "loss": 0.2118, |
| "reward": 1.9560268759727477, |
| "reward_std": 0.4524582926183939, |
| "rewards/accuracy_reward": 0.17678572228178382, |
| "rewards/format_reward": 0.873214328289032, |
| "rewards/tag_count_reward": 0.9060268312692642, |
| "step": 2125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.8357482910156, |
| "epoch": 0.7271485875224033, |
| "grad_norm": 3.1080732345581055, |
| "kl": 1.273681640625, |
| "learning_rate": 6.302235588224764e-07, |
| "loss": 0.1525, |
| "reward": 1.907366156578064, |
| "reward_std": 0.4530935399234295, |
| "rewards/accuracy_reward": 0.18035715045407413, |
| "rewards/format_reward": 0.8446428924798965, |
| "rewards/tag_count_reward": 0.8823661178350448, |
| "step": 2130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 697.6750335693359, |
| "epoch": 0.7288555090893574, |
| "grad_norm": 1.5537960529327393, |
| "kl": 1.09833984375, |
| "learning_rate": 6.229566288137212e-07, |
| "loss": 0.1274, |
| "reward": 2.033705461025238, |
| "reward_std": 0.41119332425296307, |
| "rewards/accuracy_reward": 0.192857151851058, |
| "rewards/format_reward": 0.9071429014205933, |
| "rewards/tag_count_reward": 0.933705398440361, |
| "step": 2135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 718.7955627441406, |
| "epoch": 0.7305624306563113, |
| "grad_norm": 1.1402044296264648, |
| "kl": 1.59580078125, |
| "learning_rate": 6.157208423988513e-07, |
| "loss": 0.1568, |
| "reward": 1.930357241630554, |
| "reward_std": 0.4769420772790909, |
| "rewards/accuracy_reward": 0.1625000076368451, |
| "rewards/format_reward": 0.8651786118745803, |
| "rewards/tag_count_reward": 0.9026786148548126, |
| "step": 2140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 703.5714599609375, |
| "epoch": 0.7322693522232654, |
| "grad_norm": 5.189487934112549, |
| "kl": 2.553125, |
| "learning_rate": 6.085164565188594e-07, |
| "loss": 0.2333, |
| "reward": 1.837723284959793, |
| "reward_std": 0.5507455065846443, |
| "rewards/accuracy_reward": 0.15982143608853222, |
| "rewards/format_reward": 0.8116071820259094, |
| "rewards/tag_count_reward": 0.866294676065445, |
| "step": 2145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 735.147348022461, |
| "epoch": 0.7339762737902193, |
| "grad_norm": 5.181674480438232, |
| "kl": 2.1427734375, |
| "learning_rate": 6.013437269997111e-07, |
| "loss": 0.2152, |
| "reward": 1.794866132736206, |
| "reward_std": 0.6164493501186371, |
| "rewards/accuracy_reward": 0.15178572125732898, |
| "rewards/format_reward": 0.7875000447034836, |
| "rewards/tag_count_reward": 0.855580398440361, |
| "step": 2150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 721.4187896728515, |
| "epoch": 0.7356831953571733, |
| "grad_norm": 2.7006282806396484, |
| "kl": 1.9798828125, |
| "learning_rate": 5.942029085432636e-07, |
| "loss": 0.2448, |
| "reward": 1.9575893998146057, |
| "reward_std": 0.5865769028663635, |
| "rewards/accuracy_reward": 0.2482142984867096, |
| "rewards/format_reward": 0.8321428954601288, |
| "rewards/tag_count_reward": 0.8772321879863739, |
| "step": 2155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 761.2812805175781, |
| "epoch": 0.7373901169241274, |
| "grad_norm": 3.3802006244659424, |
| "kl": 2.14765625, |
| "learning_rate": 5.87094254718219e-07, |
| "loss": 0.1986, |
| "reward": 1.8209822237491609, |
| "reward_std": 0.5044765949249268, |
| "rewards/accuracy_reward": 0.12767857704311608, |
| "rewards/format_reward": 0.8241071820259094, |
| "rewards/tag_count_reward": 0.8691964715719223, |
| "step": 2160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 717.7089599609375, |
| "epoch": 0.7390970384910813, |
| "grad_norm": 3.371708631515503, |
| "kl": 1.8705078125, |
| "learning_rate": 5.80018017951123e-07, |
| "loss": 0.2285, |
| "reward": 1.9609375894069672, |
| "reward_std": 0.527063025534153, |
| "rewards/accuracy_reward": 0.20446429466828703, |
| "rewards/format_reward": 0.8553571850061417, |
| "rewards/tag_count_reward": 0.9011161148548126, |
| "step": 2165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 724.678598022461, |
| "epoch": 0.7408039600580353, |
| "grad_norm": 1.3309767246246338, |
| "kl": 1.638671875, |
| "learning_rate": 5.729744495173978e-07, |
| "loss": 0.1628, |
| "reward": 1.9218750953674317, |
| "reward_std": 0.4835737131536007, |
| "rewards/accuracy_reward": 0.1580357243306935, |
| "rewards/format_reward": 0.8607143253087998, |
| "rewards/tag_count_reward": 0.9031250417232514, |
| "step": 2170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 709.6544952392578, |
| "epoch": 0.7425108816249893, |
| "grad_norm": 2.5176899433135986, |
| "kl": 1.289453125, |
| "learning_rate": 5.659637995324229e-07, |
| "loss": 0.1573, |
| "reward": 1.9895090281963348, |
| "reward_std": 0.4916133493185043, |
| "rewards/accuracy_reward": 0.2044642967171967, |
| "rewards/format_reward": 0.875892898440361, |
| "rewards/tag_count_reward": 0.9091518193483352, |
| "step": 2175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 747.656283569336, |
| "epoch": 0.7442178031919433, |
| "grad_norm": 0.922635018825531, |
| "kl": 1.71025390625, |
| "learning_rate": 5.589863169426506e-07, |
| "loss": 0.1529, |
| "reward": 1.9279018759727478, |
| "reward_std": 0.5085976898670197, |
| "rewards/accuracy_reward": 0.1892857227474451, |
| "rewards/format_reward": 0.8473214715719223, |
| "rewards/tag_count_reward": 0.8912946850061416, |
| "step": 2180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 734.5178894042969, |
| "epoch": 0.7459247247588974, |
| "grad_norm": 1.6850134134292603, |
| "kl": 1.76162109375, |
| "learning_rate": 5.520422495167671e-07, |
| "loss": 0.24, |
| "reward": 1.9770090103149414, |
| "reward_std": 0.47378580197691916, |
| "rewards/accuracy_reward": 0.2151785809546709, |
| "rewards/format_reward": 0.8607143282890319, |
| "rewards/tag_count_reward": 0.9011161088943481, |
| "step": 2185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 728.4803924560547, |
| "epoch": 0.7476316463258513, |
| "grad_norm": 3.3271028995513916, |
| "kl": 1.80859375, |
| "learning_rate": 5.451318438368943e-07, |
| "loss": 0.2188, |
| "reward": 1.9725447177886963, |
| "reward_std": 0.49253502711653707, |
| "rewards/accuracy_reward": 0.20267858086153864, |
| "rewards/format_reward": 0.8651786088943482, |
| "rewards/tag_count_reward": 0.9046875417232514, |
| "step": 2190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.0098602294922, |
| "epoch": 0.7493385678928053, |
| "grad_norm": 2.4637181758880615, |
| "kl": 1.594140625, |
| "learning_rate": 5.382553452898354e-07, |
| "loss": 0.1562, |
| "reward": 1.9122768878936767, |
| "reward_std": 0.4774385288357735, |
| "rewards/accuracy_reward": 0.15089286360889673, |
| "rewards/format_reward": 0.8616071850061416, |
| "rewards/tag_count_reward": 0.8997768282890319, |
| "step": 2195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 713.5857437133789, |
| "epoch": 0.7510454894597594, |
| "grad_norm": 3.416898012161255, |
| "kl": 2.0388671875, |
| "learning_rate": 5.314129980583572e-07, |
| "loss": 0.2084, |
| "reward": 1.9265625834465028, |
| "reward_std": 0.5427880018949509, |
| "rewards/accuracy_reward": 0.21250001024454832, |
| "rewards/format_reward": 0.832142898440361, |
| "rewards/tag_count_reward": 0.8819196820259094, |
| "step": 2200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.0196838378906, |
| "epoch": 0.7527524110267133, |
| "grad_norm": 2.489032506942749, |
| "kl": 2.124609375, |
| "learning_rate": 5.246050451125244e-07, |
| "loss": 0.1666, |
| "reward": 1.8587054431438446, |
| "reward_std": 0.6338795974850655, |
| "rewards/accuracy_reward": 0.18035715045407413, |
| "rewards/format_reward": 0.8107143253087997, |
| "rewards/tag_count_reward": 0.8676339685916901, |
| "step": 2205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 750.5732421875, |
| "epoch": 0.7544593325936674, |
| "grad_norm": 1.8052889108657837, |
| "kl": 2.0294921875, |
| "learning_rate": 5.178317282010667e-07, |
| "loss": 0.2124, |
| "reward": 1.9091518640518188, |
| "reward_std": 0.5391280226409435, |
| "rewards/accuracy_reward": 0.1982142969965935, |
| "rewards/format_reward": 0.8312500417232513, |
| "rewards/tag_count_reward": 0.8796875447034835, |
| "step": 2210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.4928985595703, |
| "epoch": 0.7561662541606213, |
| "grad_norm": 2.0707011222839355, |
| "kl": 1.9193359375, |
| "learning_rate": 5.110932878427982e-07, |
| "loss": 0.1962, |
| "reward": 1.904017949104309, |
| "reward_std": 0.4930908754467964, |
| "rewards/accuracy_reward": 0.19107143729925155, |
| "rewards/format_reward": 0.8366071820259094, |
| "rewards/tag_count_reward": 0.8763393223285675, |
| "step": 2215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 684.6955688476562, |
| "epoch": 0.7578731757275753, |
| "grad_norm": 2.0002994537353516, |
| "kl": 1.47373046875, |
| "learning_rate": 5.043899633180737e-07, |
| "loss": 0.1337, |
| "reward": 2.0084822356700895, |
| "reward_std": 0.466427081823349, |
| "rewards/accuracy_reward": 0.20178572162985803, |
| "rewards/format_reward": 0.8866071820259094, |
| "rewards/tag_count_reward": 0.920089328289032, |
| "step": 2220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 749.0723571777344, |
| "epoch": 0.7595800972945294, |
| "grad_norm": 1.084472894668579, |
| "kl": 1.1036865234375, |
| "learning_rate": 4.977219926602959e-07, |
| "loss": 0.1531, |
| "reward": 2.0250000953674316, |
| "reward_std": 0.39613064378499985, |
| "rewards/accuracy_reward": 0.19732143655419349, |
| "rewards/format_reward": 0.8982143253087997, |
| "rewards/tag_count_reward": 0.9294643312692642, |
| "step": 2225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.555386352539, |
| "epoch": 0.7612870188614833, |
| "grad_norm": 2.8694820404052734, |
| "kl": 1.3392578125, |
| "learning_rate": 4.910896126474581e-07, |
| "loss": 0.1268, |
| "reward": 2.042410796880722, |
| "reward_std": 0.3969473861157894, |
| "rewards/accuracy_reward": 0.186607151851058, |
| "rewards/format_reward": 0.9142857521772385, |
| "rewards/tag_count_reward": 0.9415179044008255, |
| "step": 2230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 703.9161041259765, |
| "epoch": 0.7629939404284373, |
| "grad_norm": 4.769933223724365, |
| "kl": 1.3875, |
| "learning_rate": 4.844930587937399e-07, |
| "loss": 0.1978, |
| "reward": 2.0287947416305543, |
| "reward_std": 0.41266813427209853, |
| "rewards/accuracy_reward": 0.1946428656578064, |
| "rewards/format_reward": 0.9017857551574707, |
| "rewards/tag_count_reward": 0.9323661208152771, |
| "step": 2235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 730.9107452392578, |
| "epoch": 0.7647008619953913, |
| "grad_norm": 2.770189046859741, |
| "kl": 1.587255859375, |
| "learning_rate": 4.779325653411413e-07, |
| "loss": 0.1698, |
| "reward": 2.007366156578064, |
| "reward_std": 0.4938293993473053, |
| "rewards/accuracy_reward": 0.20446429401636124, |
| "rewards/format_reward": 0.8839286208152771, |
| "rewards/tag_count_reward": 0.9189732581377029, |
| "step": 2240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 758.0080657958985, |
| "epoch": 0.7664077835623453, |
| "grad_norm": 3.440570592880249, |
| "kl": 1.26435546875, |
| "learning_rate": 4.714083652511686e-07, |
| "loss": 0.1752, |
| "reward": 1.99553582072258, |
| "reward_std": 0.4044483944773674, |
| "rewards/accuracy_reward": 0.1919642947614193, |
| "rewards/format_reward": 0.8848214656114578, |
| "rewards/tag_count_reward": 0.9187500447034835, |
| "step": 2245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.3080688476563, |
| "epoch": 0.7681147051292994, |
| "grad_norm": 2.006042242050171, |
| "kl": 1.3291015625, |
| "learning_rate": 4.6492069019655783e-07, |
| "loss": 0.1251, |
| "reward": 1.9899554550647736, |
| "reward_std": 0.4256874620914459, |
| "rewards/accuracy_reward": 0.16964286305010318, |
| "rewards/format_reward": 0.8937500417232513, |
| "rewards/tag_count_reward": 0.9265625447034835, |
| "step": 2250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 754.3616333007812, |
| "epoch": 0.7698216266962533, |
| "grad_norm": 1.4680705070495605, |
| "kl": 1.687109375, |
| "learning_rate": 4.5846977055305117e-07, |
| "loss": 0.2298, |
| "reward": 1.923214364051819, |
| "reward_std": 0.47255007922649384, |
| "rewards/accuracy_reward": 0.15446429289877414, |
| "rewards/format_reward": 0.8607143342494965, |
| "rewards/tag_count_reward": 0.9080357551574707, |
| "step": 2255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 766.7187866210937, |
| "epoch": 0.7715285482632073, |
| "grad_norm": 2.3678011894226074, |
| "kl": 1.8763671875, |
| "learning_rate": 4.5205583539121457e-07, |
| "loss": 0.2091, |
| "reward": 1.9611608386039734, |
| "reward_std": 0.4554700754582882, |
| "rewards/accuracy_reward": 0.16160715138539672, |
| "rewards/format_reward": 0.8812500447034836, |
| "rewards/tag_count_reward": 0.9183036118745804, |
| "step": 2260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 746.3866333007812, |
| "epoch": 0.7732354698301613, |
| "grad_norm": 3.2156410217285156, |
| "kl": 1.5232421875, |
| "learning_rate": 4.456791124683043e-07, |
| "loss": 0.1613, |
| "reward": 1.9868304550647735, |
| "reward_std": 0.4991786405444145, |
| "rewards/accuracy_reward": 0.19017858076840638, |
| "rewards/format_reward": 0.879464328289032, |
| "rewards/tag_count_reward": 0.9171875447034836, |
| "step": 2265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 729.1241424560546, |
| "epoch": 0.7749423913971153, |
| "grad_norm": 1.6405709981918335, |
| "kl": 1.175, |
| "learning_rate": 4.3933982822017883e-07, |
| "loss": 0.1704, |
| "reward": 1.9705358147621155, |
| "reward_std": 0.38443772345781324, |
| "rewards/accuracy_reward": 0.1714285803027451, |
| "rewards/format_reward": 0.8857143223285675, |
| "rewards/tag_count_reward": 0.9133929014205933, |
| "step": 2270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 736.1670013427735, |
| "epoch": 0.7766493129640692, |
| "grad_norm": 2.096561908721924, |
| "kl": 1.32587890625, |
| "learning_rate": 4.330382077532594e-07, |
| "loss": 0.1661, |
| "reward": 1.9901786684989928, |
| "reward_std": 0.4159936264157295, |
| "rewards/accuracy_reward": 0.17678572433069348, |
| "rewards/format_reward": 0.8892857521772385, |
| "rewards/tag_count_reward": 0.9241071790456772, |
| "step": 2275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 713.774136352539, |
| "epoch": 0.7783562345310233, |
| "grad_norm": 1.4854867458343506, |
| "kl": 1.338671875, |
| "learning_rate": 4.2677447483653544e-07, |
| "loss": 0.1976, |
| "reward": 2.014509016275406, |
| "reward_std": 0.3893769010901451, |
| "rewards/accuracy_reward": 0.1946428687311709, |
| "rewards/format_reward": 0.8955357551574707, |
| "rewards/tag_count_reward": 0.924330398440361, |
| "step": 2280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 744.8598602294921, |
| "epoch": 0.7800631560979773, |
| "grad_norm": 1.2633461952209473, |
| "kl": 1.43720703125, |
| "learning_rate": 4.2054885189361833e-07, |
| "loss": 0.2002, |
| "reward": 1.9819197416305543, |
| "reward_std": 0.4485273748636246, |
| "rewards/accuracy_reward": 0.17142857760190963, |
| "rewards/format_reward": 0.8857143342494964, |
| "rewards/tag_count_reward": 0.924776828289032, |
| "step": 2285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.2277160644531, |
| "epoch": 0.7817700776649313, |
| "grad_norm": 1.8592782020568848, |
| "kl": 1.3447265625, |
| "learning_rate": 4.143615599948437e-07, |
| "loss": 0.2039, |
| "reward": 1.9948661625385284, |
| "reward_std": 0.46121391505002973, |
| "rewards/accuracy_reward": 0.18482143487781286, |
| "rewards/format_reward": 0.8883929014205932, |
| "rewards/tag_count_reward": 0.9216518223285675, |
| "step": 2290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 755.8902099609375, |
| "epoch": 0.7834769992318853, |
| "grad_norm": 3.4845056533813477, |
| "kl": 1.9146484375, |
| "learning_rate": 4.0821281884942145e-07, |
| "loss": 0.2436, |
| "reward": 1.9580358266830444, |
| "reward_std": 0.48862814009189603, |
| "rewards/accuracy_reward": 0.17857143571600317, |
| "rewards/format_reward": 0.869642898440361, |
| "rewards/tag_count_reward": 0.9098214715719223, |
| "step": 2295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 750.5259246826172, |
| "epoch": 0.7851839207988393, |
| "grad_norm": 1.9733744859695435, |
| "kl": 1.48447265625, |
| "learning_rate": 4.021028467976341e-07, |
| "loss": 0.2299, |
| "reward": 1.9584822475910186, |
| "reward_std": 0.4035663403570652, |
| "rewards/accuracy_reward": 0.16071429373696447, |
| "rewards/format_reward": 0.8794643253087997, |
| "rewards/tag_count_reward": 0.918303620815277, |
| "step": 2300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 721.5187866210938, |
| "epoch": 0.7868908423657933, |
| "grad_norm": 4.208427429199219, |
| "kl": 1.501953125, |
| "learning_rate": 3.9603186080308253e-07, |
| "loss": 0.2006, |
| "reward": 2.011160784959793, |
| "reward_std": 0.47146844640374186, |
| "rewards/accuracy_reward": 0.20178572395816446, |
| "rewards/format_reward": 0.8857143312692642, |
| "rewards/tag_count_reward": 0.9236607551574707, |
| "step": 2305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 697.9473556518554, |
| "epoch": 0.7885977639327473, |
| "grad_norm": 1.42350435256958, |
| "kl": 1.84296875, |
| "learning_rate": 3.90000076444983e-07, |
| "loss": 0.2227, |
| "reward": 1.9866072297096253, |
| "reward_std": 0.4845219224691391, |
| "rewards/accuracy_reward": 0.19553572265431285, |
| "rewards/format_reward": 0.8750000417232513, |
| "rewards/tag_count_reward": 0.9160714626312256, |
| "step": 2310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 710.6911041259766, |
| "epoch": 0.7903046854997013, |
| "grad_norm": 1.7006930112838745, |
| "kl": 1.32939453125, |
| "learning_rate": 3.8400770791051087e-07, |
| "loss": 0.1753, |
| "reward": 1.9910715341567993, |
| "reward_std": 0.4488883726298809, |
| "rewards/accuracy_reward": 0.16785715138539672, |
| "rewards/format_reward": 0.8937500417232513, |
| "rewards/tag_count_reward": 0.929464328289032, |
| "step": 2315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 761.4884246826172, |
| "epoch": 0.7920116070666553, |
| "grad_norm": 1.214569091796875, |
| "kl": 1.682421875, |
| "learning_rate": 3.7805496798719545e-07, |
| "loss": 0.2685, |
| "reward": 1.9638393938541412, |
| "reward_std": 0.4939615406095982, |
| "rewards/accuracy_reward": 0.18035714877769352, |
| "rewards/format_reward": 0.8732143253087997, |
| "rewards/tag_count_reward": 0.9102679044008255, |
| "step": 2320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 724.4518188476562, |
| "epoch": 0.7937185286336093, |
| "grad_norm": 3.557129383087158, |
| "kl": 1.6748046875, |
| "learning_rate": 3.721420680553634e-07, |
| "loss": 0.1957, |
| "reward": 1.9421875953674317, |
| "reward_std": 0.4420395828783512, |
| "rewards/accuracy_reward": 0.15625000586733223, |
| "rewards/format_reward": 0.8741071879863739, |
| "rewards/tag_count_reward": 0.9118303954601288, |
| "step": 2325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.4232543945312, |
| "epoch": 0.7954254502005633, |
| "grad_norm": 2.902493476867676, |
| "kl": 1.6931640625, |
| "learning_rate": 3.6626921808063434e-07, |
| "loss": 0.2131, |
| "reward": 1.9752233147621154, |
| "reward_std": 0.48989410772919656, |
| "rewards/accuracy_reward": 0.1910714352503419, |
| "rewards/format_reward": 0.8741071850061417, |
| "rewards/tag_count_reward": 0.910044687986374, |
| "step": 2330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.6036010742188, |
| "epoch": 0.7971323717675173, |
| "grad_norm": 1.2533881664276123, |
| "kl": 1.611328125, |
| "learning_rate": 3.604366266064625e-07, |
| "loss": 0.2517, |
| "reward": 1.9265625774860382, |
| "reward_std": 0.49659521505236626, |
| "rewards/accuracy_reward": 0.16696429457515477, |
| "rewards/format_reward": 0.8598214685916901, |
| "rewards/tag_count_reward": 0.8997768223285675, |
| "step": 2335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 689.780386352539, |
| "epoch": 0.7988392933344712, |
| "grad_norm": 1.823298454284668, |
| "kl": 1.47578125, |
| "learning_rate": 3.546445007467333e-07, |
| "loss": 0.2197, |
| "reward": 1.9589286863803863, |
| "reward_std": 0.45438055247068404, |
| "rewards/accuracy_reward": 0.17678572349250316, |
| "rewards/format_reward": 0.8741071939468383, |
| "rewards/tag_count_reward": 0.908035758137703, |
| "step": 2340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.0036041259766, |
| "epoch": 0.8005462149014253, |
| "grad_norm": 1.8940644264221191, |
| "kl": 1.762109375, |
| "learning_rate": 3.488930461784075e-07, |
| "loss": 0.2641, |
| "reward": 1.8991072356700898, |
| "reward_std": 0.4983711659908295, |
| "rewards/accuracy_reward": 0.1392857225611806, |
| "rewards/format_reward": 0.8589286118745804, |
| "rewards/tag_count_reward": 0.9008928954601287, |
| "step": 2345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 718.2661041259765, |
| "epoch": 0.8022531364683793, |
| "grad_norm": 5.181538105010986, |
| "kl": 1.598828125, |
| "learning_rate": 3.431824671342198e-07, |
| "loss": 0.1734, |
| "reward": 1.9828125834465027, |
| "reward_std": 0.47254706025123594, |
| "rewards/accuracy_reward": 0.19107143431901932, |
| "rewards/format_reward": 0.8776786148548126, |
| "rewards/tag_count_reward": 0.9140625417232513, |
| "step": 2350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 737.5125274658203, |
| "epoch": 0.8039600580353333, |
| "grad_norm": 2.329116106033325, |
| "kl": 1.43359375, |
| "learning_rate": 3.375129663954233e-07, |
| "loss": 0.1932, |
| "reward": 1.9479911565780639, |
| "reward_std": 0.4480812445282936, |
| "rewards/accuracy_reward": 0.16785715129226447, |
| "rewards/format_reward": 0.8732143312692642, |
| "rewards/tag_count_reward": 0.9069196879863739, |
| "step": 2355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 739.2071746826172, |
| "epoch": 0.8056669796022873, |
| "grad_norm": 0.8552534580230713, |
| "kl": 1.71162109375, |
| "learning_rate": 3.318847452845922e-07, |
| "loss": 0.2293, |
| "reward": 1.9145090103149414, |
| "reward_std": 0.5126624539494514, |
| "rewards/accuracy_reward": 0.16339286379516124, |
| "rewards/format_reward": 0.8553571850061417, |
| "rewards/tag_count_reward": 0.8957589745521546, |
| "step": 2360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 717.5143218994141, |
| "epoch": 0.8073739011692412, |
| "grad_norm": 2.5750255584716797, |
| "kl": 1.56728515625, |
| "learning_rate": 3.2629800365847046e-07, |
| "loss": 0.2652, |
| "reward": 1.9863840222358704, |
| "reward_std": 0.4886273622512817, |
| "rewards/accuracy_reward": 0.20892858020961286, |
| "rewards/format_reward": 0.869642898440361, |
| "rewards/tag_count_reward": 0.9078125387430191, |
| "step": 2365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 738.2857482910156, |
| "epoch": 0.8090808227361953, |
| "grad_norm": 4.131805896759033, |
| "kl": 1.611328125, |
| "learning_rate": 3.207529399008756e-07, |
| "loss": 0.2404, |
| "reward": 1.9517858147621154, |
| "reward_std": 0.520108437538147, |
| "rewards/accuracy_reward": 0.169642863702029, |
| "rewards/format_reward": 0.8741071850061417, |
| "rewards/tag_count_reward": 0.908035758137703, |
| "step": 2370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 752.1777130126953, |
| "epoch": 0.8107877443031493, |
| "grad_norm": 1.5850900411605835, |
| "kl": 1.9701171875, |
| "learning_rate": 3.152497509156543e-07, |
| "loss": 0.2684, |
| "reward": 1.891071516275406, |
| "reward_std": 0.49450200498104097, |
| "rewards/accuracy_reward": 0.15000000754371284, |
| "rewards/format_reward": 0.8517857640981674, |
| "rewards/tag_count_reward": 0.8892857521772385, |
| "step": 2375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 719.3598571777344, |
| "epoch": 0.8124946658701032, |
| "grad_norm": 2.113619804382324, |
| "kl": 1.4361328125, |
| "learning_rate": 3.0978863211969146e-07, |
| "loss": 0.1825, |
| "reward": 2.00491082072258, |
| "reward_std": 0.41534021943807603, |
| "rewards/accuracy_reward": 0.1901785798370838, |
| "rewards/format_reward": 0.8910714685916901, |
| "rewards/tag_count_reward": 0.9236607611179352, |
| "step": 2380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.2830688476563, |
| "epoch": 0.8142015874370573, |
| "grad_norm": 47.90665817260742, |
| "kl": 1.62421875, |
| "learning_rate": 3.0436977743596823e-07, |
| "loss": 0.1664, |
| "reward": 1.945312601327896, |
| "reward_std": 0.4939949780702591, |
| "rewards/accuracy_reward": 0.17678572284057736, |
| "rewards/format_reward": 0.86607146859169, |
| "rewards/tag_count_reward": 0.9024554044008255, |
| "step": 2385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 739.9491424560547, |
| "epoch": 0.8159085090040112, |
| "grad_norm": 2.70536470413208, |
| "kl": 1.4115234375, |
| "learning_rate": 2.989933792866793e-07, |
| "loss": 0.1852, |
| "reward": 1.9897322237491608, |
| "reward_std": 0.47392349503934383, |
| "rewards/accuracy_reward": 0.2017857247032225, |
| "rewards/format_reward": 0.8750000476837159, |
| "rewards/tag_count_reward": 0.91294646859169, |
| "step": 2390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 759.7696807861328, |
| "epoch": 0.8176154305709653, |
| "grad_norm": 1.9668916463851929, |
| "kl": 1.85087890625, |
| "learning_rate": 2.9365962858639733e-07, |
| "loss": 0.3048, |
| "reward": 1.9325893759727477, |
| "reward_std": 0.5062482297420502, |
| "rewards/accuracy_reward": 0.17857143562287092, |
| "rewards/format_reward": 0.8544643253087998, |
| "rewards/tag_count_reward": 0.8995536118745804, |
| "step": 2395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.3286041259765, |
| "epoch": 0.8193223521379193, |
| "grad_norm": 5.197607040405273, |
| "kl": 1.83046875, |
| "learning_rate": 2.8836871473529435e-07, |
| "loss": 0.2788, |
| "reward": 1.923660808801651, |
| "reward_std": 0.5111315354704857, |
| "rewards/accuracy_reward": 0.1714285772293806, |
| "rewards/format_reward": 0.8562500387430191, |
| "rewards/tag_count_reward": 0.8959821909666061, |
| "step": 2400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 733.7250335693359, |
| "epoch": 0.8210292737048732, |
| "grad_norm": 3.3527982234954834, |
| "kl": 1.71171875, |
| "learning_rate": 2.831208256124167e-07, |
| "loss": 0.2245, |
| "reward": 1.8738840103149415, |
| "reward_std": 0.5179990664124489, |
| "rewards/accuracy_reward": 0.15089286230504512, |
| "rewards/format_reward": 0.8410714715719223, |
| "rewards/tag_count_reward": 0.8819196879863739, |
| "step": 2405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.9964660644531, |
| "epoch": 0.8227361952718273, |
| "grad_norm": 1.7930233478546143, |
| "kl": 1.6314453125, |
| "learning_rate": 2.779161475690135e-07, |
| "loss": 0.1954, |
| "reward": 1.8834822058677674, |
| "reward_std": 0.47747668251395226, |
| "rewards/accuracy_reward": 0.14285715008154512, |
| "rewards/format_reward": 0.8517857611179351, |
| "rewards/tag_count_reward": 0.8888393253087997, |
| "step": 2410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.8866363525391, |
| "epoch": 0.8244431168387812, |
| "grad_norm": 3.0464282035827637, |
| "kl": 1.894921875, |
| "learning_rate": 2.727548654219193e-07, |
| "loss": 0.2449, |
| "reward": 1.9600447237491607, |
| "reward_std": 0.531683550029993, |
| "rewards/accuracy_reward": 0.20982143972069026, |
| "rewards/format_reward": 0.8553571850061417, |
| "rewards/tag_count_reward": 0.8948661118745804, |
| "step": 2415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 717.7839569091797, |
| "epoch": 0.8261500384057353, |
| "grad_norm": 2.702840805053711, |
| "kl": 1.90859375, |
| "learning_rate": 2.6763716244699057e-07, |
| "loss": 0.2345, |
| "reward": 1.8950893878936768, |
| "reward_std": 0.5322792515158653, |
| "rewards/accuracy_reward": 0.1482142921537161, |
| "rewards/format_reward": 0.8535714685916901, |
| "rewards/tag_count_reward": 0.8933036118745804, |
| "step": 2420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 749.0768280029297, |
| "epoch": 0.8278569599726893, |
| "grad_norm": 1.9960432052612305, |
| "kl": 1.976953125, |
| "learning_rate": 2.625632203725979e-07, |
| "loss": 0.2261, |
| "reward": 1.9006697237491608, |
| "reward_std": 0.5312141239643097, |
| "rewards/accuracy_reward": 0.16607143729925156, |
| "rewards/format_reward": 0.8464286178350449, |
| "rewards/tag_count_reward": 0.8881696850061417, |
| "step": 2425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 729.7393218994141, |
| "epoch": 0.8295638815396432, |
| "grad_norm": 2.3977620601654053, |
| "kl": 1.990625, |
| "learning_rate": 2.575332193731732e-07, |
| "loss": 0.2026, |
| "reward": 1.9734375774860382, |
| "reward_std": 0.5333075791597366, |
| "rewards/accuracy_reward": 0.20714286817237734, |
| "rewards/format_reward": 0.8633929014205932, |
| "rewards/tag_count_reward": 0.9029018253087997, |
| "step": 2430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 722.9384246826172, |
| "epoch": 0.8312708031065973, |
| "grad_norm": 1.5522792339324951, |
| "kl": 1.5064453125, |
| "learning_rate": 2.525473380628127e-07, |
| "loss": 0.1976, |
| "reward": 1.9145089983940125, |
| "reward_std": 0.4954484052956104, |
| "rewards/accuracy_reward": 0.17946429280564188, |
| "rewards/format_reward": 0.8464286088943481, |
| "rewards/tag_count_reward": 0.8886161059141159, |
| "step": 2435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.9803863525391, |
| "epoch": 0.8329777246735512, |
| "grad_norm": 1.8127202987670898, |
| "kl": 1.65087890625, |
| "learning_rate": 2.4760575348893164e-07, |
| "loss": 0.2192, |
| "reward": 1.9658482909202575, |
| "reward_std": 0.4845112472772598, |
| "rewards/accuracy_reward": 0.17946429476141929, |
| "rewards/format_reward": 0.876785758137703, |
| "rewards/tag_count_reward": 0.9095982581377029, |
| "step": 2440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 703.525033569336, |
| "epoch": 0.8346846462405052, |
| "grad_norm": 1.0752168893814087, |
| "kl": 1.583203125, |
| "learning_rate": 2.427086411259812e-07, |
| "loss": 0.2127, |
| "reward": 1.9837054550647735, |
| "reward_std": 0.4994090169668198, |
| "rewards/accuracy_reward": 0.20625000940635801, |
| "rewards/format_reward": 0.8696429014205933, |
| "rewards/tag_count_reward": 0.9078125357627869, |
| "step": 2445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 732.0089660644531, |
| "epoch": 0.8363915678074593, |
| "grad_norm": 1.7221641540527344, |
| "kl": 1.6380859375, |
| "learning_rate": 2.378561748692124e-07, |
| "loss": 0.2077, |
| "reward": 1.9569197177886963, |
| "reward_std": 0.47989270444959403, |
| "rewards/accuracy_reward": 0.19285715287551283, |
| "rewards/format_reward": 0.86607146859169, |
| "rewards/tag_count_reward": 0.8979911088943482, |
| "step": 2450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 703.2803955078125, |
| "epoch": 0.8380984893744132, |
| "grad_norm": 2.0460386276245117, |
| "kl": 1.30048828125, |
| "learning_rate": 2.3304852702850688e-07, |
| "loss": 0.1829, |
| "reward": 2.0325893819332124, |
| "reward_std": 0.44422818124294283, |
| "rewards/accuracy_reward": 0.22678572358563542, |
| "rewards/format_reward": 0.8875000387430191, |
| "rewards/tag_count_reward": 0.9183036088943481, |
| "step": 2455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 702.3446746826172, |
| "epoch": 0.8398054109413673, |
| "grad_norm": 1.9257980585098267, |
| "kl": 1.203515625, |
| "learning_rate": 2.282858683222535e-07, |
| "loss": 0.185, |
| "reward": 2.0203125894069673, |
| "reward_std": 0.45232805162668227, |
| "rewards/accuracy_reward": 0.2116071492433548, |
| "rewards/format_reward": 0.8883928984403611, |
| "rewards/tag_count_reward": 0.920312550663948, |
| "step": 2460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.3973571777344, |
| "epoch": 0.8415123325083212, |
| "grad_norm": 2.9717674255371094, |
| "kl": 1.59736328125, |
| "learning_rate": 2.2356836787128947e-07, |
| "loss": 0.2189, |
| "reward": 1.9305804431438447, |
| "reward_std": 0.43135242685675623, |
| "rewards/accuracy_reward": 0.17589286426082254, |
| "rewards/format_reward": 0.8580357551574707, |
| "rewards/tag_count_reward": 0.896651828289032, |
| "step": 2465 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 714.9027130126954, |
| "epoch": 0.8432192540752752, |
| "grad_norm": 0.7697265148162842, |
| "kl": 1.35322265625, |
| "learning_rate": 2.188961931928925e-07, |
| "loss": 0.1832, |
| "reward": 2.0316965222358703, |
| "reward_std": 0.45708170533180237, |
| "rewards/accuracy_reward": 0.21428572311997413, |
| "rewards/format_reward": 0.8910714745521545, |
| "rewards/tag_count_reward": 0.9263393253087997, |
| "step": 2470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 677.9036041259766, |
| "epoch": 0.8449261756422293, |
| "grad_norm": 2.200136661529541, |
| "kl": 1.51708984375, |
| "learning_rate": 2.1426951019483327e-07, |
| "loss": 0.197, |
| "reward": 1.9850447237491609, |
| "reward_std": 0.43524651750922205, |
| "rewards/accuracy_reward": 0.17232143571600317, |
| "rewards/format_reward": 0.8883928984403611, |
| "rewards/tag_count_reward": 0.924330398440361, |
| "step": 2475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 729.7661071777344, |
| "epoch": 0.8466330972091832, |
| "grad_norm": 2.73653507232666, |
| "kl": 1.453515625, |
| "learning_rate": 2.0968848316948414e-07, |
| "loss": 0.1622, |
| "reward": 1.9546875834465027, |
| "reward_std": 0.47535726577043536, |
| "rewards/accuracy_reward": 0.19017857676371933, |
| "rewards/format_reward": 0.8625000387430191, |
| "rewards/tag_count_reward": 0.9020089656114578, |
| "step": 2480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.2491394042969, |
| "epoch": 0.8483400187761372, |
| "grad_norm": 2.792485475540161, |
| "kl": 1.142578125, |
| "learning_rate": 2.0515327478798601e-07, |
| "loss": 0.1639, |
| "reward": 1.9897322356700897, |
| "reward_std": 0.4151985734701157, |
| "rewards/accuracy_reward": 0.1767857219092548, |
| "rewards/format_reward": 0.8910714656114578, |
| "rewards/tag_count_reward": 0.9218750387430191, |
| "step": 2485 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 712.3964569091797, |
| "epoch": 0.8500469403430913, |
| "grad_norm": 2.1750404834747314, |
| "kl": 1.362890625, |
| "learning_rate": 2.006640460944701e-07, |
| "loss": 0.2054, |
| "reward": 2.036384052038193, |
| "reward_std": 0.4850167170166969, |
| "rewards/accuracy_reward": 0.24107143646106124, |
| "rewards/format_reward": 0.8794643312692643, |
| "rewards/tag_count_reward": 0.9158482551574707, |
| "step": 2490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 713.9687805175781, |
| "epoch": 0.8517538619100452, |
| "grad_norm": 2.2476091384887695, |
| "kl": 1.6875244140625, |
| "learning_rate": 1.9622095650034077e-07, |
| "loss": 0.2528, |
| "reward": 2.035267961025238, |
| "reward_std": 0.5042560985311866, |
| "rewards/accuracy_reward": 0.24553572237491608, |
| "rewards/format_reward": 0.8767857521772384, |
| "rewards/tag_count_reward": 0.9129464715719223, |
| "step": 2495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.7411010742187, |
| "epoch": 0.8534607834769993, |
| "grad_norm": 2.9233672618865967, |
| "kl": 1.546484375, |
| "learning_rate": 1.9182416377861388e-07, |
| "loss": 0.2024, |
| "reward": 1.935491156578064, |
| "reward_std": 0.4808127790689468, |
| "rewards/accuracy_reward": 0.1607142912223935, |
| "rewards/format_reward": 0.866964328289032, |
| "rewards/tag_count_reward": 0.9078125476837158, |
| "step": 2500 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.3143157958984, |
| "epoch": 0.8551677050439532, |
| "grad_norm": 3.2671101093292236, |
| "kl": 1.0187744140625, |
| "learning_rate": 1.8747382405831515e-07, |
| "loss": 0.1549, |
| "reward": 1.9649554669857026, |
| "reward_std": 0.3683423440903425, |
| "rewards/accuracy_reward": 0.1535714365541935, |
| "rewards/format_reward": 0.8919643282890319, |
| "rewards/tag_count_reward": 0.9194196850061417, |
| "step": 2505 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 707.5116363525391, |
| "epoch": 0.8568746266109072, |
| "grad_norm": 1.725104808807373, |
| "kl": 1.13115234375, |
| "learning_rate": 1.8317009181893507e-07, |
| "loss": 0.1587, |
| "reward": 1.9776786625385285, |
| "reward_std": 0.4078820027410984, |
| "rewards/accuracy_reward": 0.17053572311997414, |
| "rewards/format_reward": 0.8883929014205932, |
| "rewards/tag_count_reward": 0.9187500447034835, |
| "step": 2510 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 726.5812835693359, |
| "epoch": 0.8585815481778613, |
| "grad_norm": 0.9512146711349487, |
| "kl": 1.148876953125, |
| "learning_rate": 1.7891311988494523e-07, |
| "loss": 0.1464, |
| "reward": 2.0390625953674317, |
| "reward_std": 0.40070234164595603, |
| "rewards/accuracy_reward": 0.2053571523167193, |
| "rewards/format_reward": 0.904464328289032, |
| "rewards/tag_count_reward": 0.9292411148548126, |
| "step": 2515 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.7821838378907, |
| "epoch": 0.8602884697448152, |
| "grad_norm": 1.6017332077026367, |
| "kl": 1.38046875, |
| "learning_rate": 1.7470305942036864e-07, |
| "loss": 0.163, |
| "reward": 2.030134028196335, |
| "reward_std": 0.4416078761219978, |
| "rewards/accuracy_reward": 0.20982143776491285, |
| "rewards/format_reward": 0.8937500447034836, |
| "rewards/tag_count_reward": 0.9265625476837158, |
| "step": 2520 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.2955627441406, |
| "epoch": 0.8619953913117693, |
| "grad_norm": 1.2069106101989746, |
| "kl": 1.48408203125, |
| "learning_rate": 1.705400599234152e-07, |
| "loss": 0.2074, |
| "reward": 2.0006697118282317, |
| "reward_std": 0.48490975946187975, |
| "rewards/accuracy_reward": 0.22053572302684188, |
| "rewards/format_reward": 0.8741071850061417, |
| "rewards/tag_count_reward": 0.9060268253087997, |
| "step": 2525 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 695.4312896728516, |
| "epoch": 0.8637023128787232, |
| "grad_norm": 1.0792300701141357, |
| "kl": 1.41123046875, |
| "learning_rate": 1.6642426922117037e-07, |
| "loss": 0.1589, |
| "reward": 1.9928572356700898, |
| "reward_std": 0.4516665853559971, |
| "rewards/accuracy_reward": 0.17232143692672253, |
| "rewards/format_reward": 0.8955357521772385, |
| "rewards/tag_count_reward": 0.9250000417232513, |
| "step": 2530 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 724.9500274658203, |
| "epoch": 0.8654092344456772, |
| "grad_norm": 1.6458358764648438, |
| "kl": 1.53515625, |
| "learning_rate": 1.62355833464347e-07, |
| "loss": 0.1851, |
| "reward": 1.9680804491043091, |
| "reward_std": 0.4724808134138584, |
| "rewards/accuracy_reward": 0.18125000586733223, |
| "rewards/format_reward": 0.876785758137703, |
| "rewards/tag_count_reward": 0.9100446850061417, |
| "step": 2535 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.3027099609375, |
| "epoch": 0.8671161560126313, |
| "grad_norm": 2.028154134750366, |
| "kl": 1.37314453125, |
| "learning_rate": 1.5833489712209643e-07, |
| "loss": 0.1602, |
| "reward": 1.980580449104309, |
| "reward_std": 0.47636549547314644, |
| "rewards/accuracy_reward": 0.20178572311997414, |
| "rewards/format_reward": 0.8705357521772384, |
| "rewards/tag_count_reward": 0.9082589656114578, |
| "step": 2540 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 723.8598449707031, |
| "epoch": 0.8688230775795852, |
| "grad_norm": 1.2274961471557617, |
| "kl": 1.5658203125, |
| "learning_rate": 1.5436160297687614e-07, |
| "loss": 0.1623, |
| "reward": 1.9558036625385284, |
| "reward_std": 0.47571387365460394, |
| "rewards/accuracy_reward": 0.16071429289877415, |
| "rewards/format_reward": 0.879464328289032, |
| "rewards/tag_count_reward": 0.9156250357627869, |
| "step": 2545 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 715.9285949707031, |
| "epoch": 0.8705299991465392, |
| "grad_norm": 1.1409246921539307, |
| "kl": 1.7728515625, |
| "learning_rate": 1.5043609211938257e-07, |
| "loss": 0.1836, |
| "reward": 1.9303572237491609, |
| "reward_std": 0.5227501168847084, |
| "rewards/accuracy_reward": 0.16607143748551606, |
| "rewards/format_reward": 0.8642857611179352, |
| "rewards/tag_count_reward": 0.9000000476837158, |
| "step": 2550 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 741.2357452392578, |
| "epoch": 0.8722369207134932, |
| "grad_norm": 2.0067524909973145, |
| "kl": 1.790625, |
| "learning_rate": 1.4655850394353738e-07, |
| "loss": 0.2562, |
| "reward": 1.9258929550647736, |
| "reward_std": 0.5418844744563103, |
| "rewards/accuracy_reward": 0.16250000735744835, |
| "rewards/format_reward": 0.8625000387430191, |
| "rewards/tag_count_reward": 0.9008929014205933, |
| "step": 2555 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 696.0223449707031, |
| "epoch": 0.8739438422804472, |
| "grad_norm": 3.1993730068206787, |
| "kl": 1.7654296875, |
| "learning_rate": 1.4272897614154161e-07, |
| "loss": 0.2202, |
| "reward": 1.9792411744594574, |
| "reward_std": 0.5070497654378414, |
| "rewards/accuracy_reward": 0.20714286770671606, |
| "rewards/format_reward": 0.86607146859169, |
| "rewards/tag_count_reward": 0.9060268193483353, |
| "step": 2560 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 696.559848022461, |
| "epoch": 0.8756507638474013, |
| "grad_norm": 2.2070538997650146, |
| "kl": 1.67294921875, |
| "learning_rate": 1.389476446989828e-07, |
| "loss": 0.2129, |
| "reward": 1.979910808801651, |
| "reward_std": 0.5403969317674637, |
| "rewards/accuracy_reward": 0.21160715445876122, |
| "rewards/format_reward": 0.8625000387430191, |
| "rewards/tag_count_reward": 0.9058036118745804, |
| "step": 2565 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 729.2357513427735, |
| "epoch": 0.8773576854143552, |
| "grad_norm": 1.5151607990264893, |
| "kl": 1.8220703125, |
| "learning_rate": 1.3521464389000853e-07, |
| "loss": 0.2421, |
| "reward": 1.950446504354477, |
| "reward_std": 0.5206233039498329, |
| "rewards/accuracy_reward": 0.18750000931322575, |
| "rewards/format_reward": 0.8598214685916901, |
| "rewards/tag_count_reward": 0.9031250447034835, |
| "step": 2570 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 736.9411010742188, |
| "epoch": 0.8790646069813092, |
| "grad_norm": 3.210714101791382, |
| "kl": 1.8384765625, |
| "learning_rate": 1.3153010627255728e-07, |
| "loss": 0.2388, |
| "reward": 1.8953125953674317, |
| "reward_std": 0.522989672422409, |
| "rewards/accuracy_reward": 0.16071429401636123, |
| "rewards/format_reward": 0.8455357521772384, |
| "rewards/tag_count_reward": 0.8890625387430191, |
| "step": 2575 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 744.8303863525391, |
| "epoch": 0.8807715285482632, |
| "grad_norm": 2.01275897026062, |
| "kl": 2.0505859375, |
| "learning_rate": 1.2789416268365146e-07, |
| "loss": 0.2423, |
| "reward": 1.9046875894069673, |
| "reward_std": 0.588728591799736, |
| "rewards/accuracy_reward": 0.17857143739238382, |
| "rewards/format_reward": 0.8410714775323868, |
| "rewards/tag_count_reward": 0.8850446820259095, |
| "step": 2580 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.4723541259766, |
| "epoch": 0.8824784501152172, |
| "grad_norm": 1.6076045036315918, |
| "kl": 1.7802734375, |
| "learning_rate": 1.2430694223475087e-07, |
| "loss": 0.2169, |
| "reward": 1.8776786565780639, |
| "reward_std": 0.5265608415007591, |
| "rewards/accuracy_reward": 0.1544642928056419, |
| "rewards/format_reward": 0.8366071879863739, |
| "rewards/tag_count_reward": 0.8866071820259094, |
| "step": 2585 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.5223541259766, |
| "epoch": 0.8841853716821712, |
| "grad_norm": 2.9953057765960693, |
| "kl": 1.766796875, |
| "learning_rate": 1.2076857230717004e-07, |
| "loss": 0.2396, |
| "reward": 1.876339364051819, |
| "reward_std": 0.5042637214064598, |
| "rewards/accuracy_reward": 0.1357142912223935, |
| "rewards/format_reward": 0.8473214656114578, |
| "rewards/tag_count_reward": 0.8933036178350449, |
| "step": 2590 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 685.8982513427734, |
| "epoch": 0.8858922932491252, |
| "grad_norm": 1.664928913116455, |
| "kl": 1.6484375, |
| "learning_rate": 1.1727917854755238e-07, |
| "loss": 0.1894, |
| "reward": 1.946428656578064, |
| "reward_std": 0.5181833237409592, |
| "rewards/accuracy_reward": 0.20089286677539347, |
| "rewards/format_reward": 0.8500000447034836, |
| "rewards/tag_count_reward": 0.8955357581377029, |
| "step": 2595 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 738.9411041259766, |
| "epoch": 0.8875992148160792, |
| "grad_norm": 1.6388742923736572, |
| "kl": 1.8392578125, |
| "learning_rate": 1.1383888486341032e-07, |
| "loss": 0.2812, |
| "reward": 1.861160808801651, |
| "reward_std": 0.5453050881624222, |
| "rewards/accuracy_reward": 0.1428571492433548, |
| "rewards/format_reward": 0.835714328289032, |
| "rewards/tag_count_reward": 0.8825893372297287, |
| "step": 2600 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 692.9500396728515, |
| "epoch": 0.8893061363830332, |
| "grad_norm": 1.3716119527816772, |
| "kl": 1.69130859375, |
| "learning_rate": 1.1044781341872411e-07, |
| "loss": 0.2436, |
| "reward": 1.9834822297096253, |
| "reward_std": 0.5287271916866303, |
| "rewards/accuracy_reward": 0.23928572330623865, |
| "rewards/format_reward": 0.8526786148548127, |
| "rewards/tag_count_reward": 0.8915178954601288, |
| "step": 2605 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.263427734375, |
| "epoch": 0.8910130579499872, |
| "grad_norm": 1.7265480756759644, |
| "kl": 1.85703125, |
| "learning_rate": 1.0710608462960486e-07, |
| "loss": 0.2463, |
| "reward": 1.9129464983940125, |
| "reward_std": 0.5115076020359993, |
| "rewards/accuracy_reward": 0.16428571874275805, |
| "rewards/format_reward": 0.8526786118745804, |
| "rewards/tag_count_reward": 0.8959821909666061, |
| "step": 2610 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.6116363525391, |
| "epoch": 0.8927199795169412, |
| "grad_norm": 5.472092151641846, |
| "kl": 1.733984375, |
| "learning_rate": 1.038138171600177e-07, |
| "loss": 0.2281, |
| "reward": 1.9308036863803864, |
| "reward_std": 0.4873148113489151, |
| "rewards/accuracy_reward": 0.16517857909202577, |
| "rewards/format_reward": 0.8642857491970062, |
| "rewards/tag_count_reward": 0.9013393223285675, |
| "step": 2615 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 754.4884185791016, |
| "epoch": 0.8944269010838952, |
| "grad_norm": 1.2924634218215942, |
| "kl": 1.869921875, |
| "learning_rate": 1.005711279175694e-07, |
| "loss": 0.2641, |
| "reward": 1.903348296880722, |
| "reward_std": 0.5570731215178967, |
| "rewards/accuracy_reward": 0.16785714756697417, |
| "rewards/format_reward": 0.845535758137703, |
| "rewards/tag_count_reward": 0.8899553954601288, |
| "step": 2620 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 684.0527130126953, |
| "epoch": 0.8961338226508492, |
| "grad_norm": 2.2256343364715576, |
| "kl": 1.68251953125, |
| "learning_rate": 9.737813204935497e-08, |
| "loss": 0.2139, |
| "reward": 1.90401793718338, |
| "reward_std": 0.5087300404906273, |
| "rewards/accuracy_reward": 0.16607143655419349, |
| "rewards/format_reward": 0.8500000387430191, |
| "rewards/tag_count_reward": 0.8879464656114578, |
| "step": 2625 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.5366394042969, |
| "epoch": 0.8978407442178032, |
| "grad_norm": 2.7418696880340576, |
| "kl": 1.9642578125, |
| "learning_rate": 9.423494293787082e-08, |
| "loss": 0.2427, |
| "reward": 1.8968750774860381, |
| "reward_std": 0.5185163721442223, |
| "rewards/accuracy_reward": 0.16071429187431932, |
| "rewards/format_reward": 0.8455357551574707, |
| "rewards/tag_count_reward": 0.8906250387430191, |
| "step": 2630 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.6437774658203, |
| "epoch": 0.8995476657847572, |
| "grad_norm": 2.741389274597168, |
| "kl": 1.519921875, |
| "learning_rate": 9.114167219698744e-08, |
| "loss": 0.1709, |
| "reward": 1.954017961025238, |
| "reward_std": 0.464486388117075, |
| "rewards/accuracy_reward": 0.1812500087544322, |
| "rewards/format_reward": 0.8678571850061416, |
| "rewards/tag_count_reward": 0.9049107581377029, |
| "step": 2635 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 724.1705718994141, |
| "epoch": 0.9012545873517112, |
| "grad_norm": 1.2222126722335815, |
| "kl": 1.3740234375, |
| "learning_rate": 8.809842966798587e-08, |
| "loss": 0.2209, |
| "reward": 1.959598284959793, |
| "reward_std": 0.5034348502755165, |
| "rewards/accuracy_reward": 0.208035721629858, |
| "rewards/format_reward": 0.8562500476837158, |
| "rewards/tag_count_reward": 0.8953125417232514, |
| "step": 2640 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 710.2991378784179, |
| "epoch": 0.9029615089186652, |
| "grad_norm": 0.7933781147003174, |
| "kl": 1.36630859375, |
| "learning_rate": 8.510532341565807e-08, |
| "loss": 0.1944, |
| "reward": 1.9640626072883607, |
| "reward_std": 0.4950651377439499, |
| "rewards/accuracy_reward": 0.20982143916189672, |
| "rewards/format_reward": 0.8562500387430191, |
| "rewards/tag_count_reward": 0.8979911118745804, |
| "step": 2645 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 701.2223571777344, |
| "epoch": 0.9046684304856192, |
| "grad_norm": 5.042927265167236, |
| "kl": 1.5912109375, |
| "learning_rate": 8.216245972446962e-08, |
| "loss": 0.1986, |
| "reward": 1.9100447297096252, |
| "reward_std": 0.49433635324239733, |
| "rewards/accuracy_reward": 0.15357143441215157, |
| "rewards/format_reward": 0.8571428984403611, |
| "rewards/tag_count_reward": 0.8993303954601288, |
| "step": 2650 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 705.6312866210938, |
| "epoch": 0.9063753520525731, |
| "grad_norm": 1.833943486213684, |
| "kl": 1.58671875, |
| "learning_rate": 7.926994309478403e-08, |
| "loss": 0.2351, |
| "reward": 1.9138393700122833, |
| "reward_std": 0.5347620368003845, |
| "rewards/accuracy_reward": 0.16875000894069672, |
| "rewards/format_reward": 0.8500000417232514, |
| "rewards/tag_count_reward": 0.8950893223285675, |
| "step": 2655 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 691.6018188476562, |
| "epoch": 0.9080822736195272, |
| "grad_norm": 1.80747389793396, |
| "kl": 1.40380859375, |
| "learning_rate": 7.642787623915442e-08, |
| "loss": 0.1708, |
| "reward": 1.991517972946167, |
| "reward_std": 0.4647905558347702, |
| "rewards/accuracy_reward": 0.19464286686852575, |
| "rewards/format_reward": 0.8812500417232514, |
| "rewards/tag_count_reward": 0.9156250387430191, |
| "step": 2660 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 683.5330688476563, |
| "epoch": 0.9097891951864812, |
| "grad_norm": 2.021982192993164, |
| "kl": 1.7751953125, |
| "learning_rate": 7.36363600786733e-08, |
| "loss": 0.2256, |
| "reward": 1.9256697237491607, |
| "reward_std": 0.5254194289445877, |
| "rewards/accuracy_reward": 0.18482143618166447, |
| "rewards/format_reward": 0.8491071820259094, |
| "rewards/tag_count_reward": 0.8917411029338836, |
| "step": 2665 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 707.2678833007812, |
| "epoch": 0.9114961167534352, |
| "grad_norm": 1.5858349800109863, |
| "kl": 1.50009765625, |
| "learning_rate": 7.089549373939186e-08, |
| "loss": 0.2216, |
| "reward": 1.9671875894069673, |
| "reward_std": 0.495637346804142, |
| "rewards/accuracy_reward": 0.18392857955768704, |
| "rewards/format_reward": 0.8741071879863739, |
| "rewards/tag_count_reward": 0.9091518342494964, |
| "step": 2670 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.431283569336, |
| "epoch": 0.9132030383203892, |
| "grad_norm": 0.9021114110946655, |
| "kl": 1.64638671875, |
| "learning_rate": 6.8205374548798e-08, |
| "loss": 0.196, |
| "reward": 1.9872768700122834, |
| "reward_std": 0.4923027902841568, |
| "rewards/accuracy_reward": 0.2142857253551483, |
| "rewards/format_reward": 0.869642898440361, |
| "rewards/tag_count_reward": 0.9033482581377029, |
| "step": 2675 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 721.2375335693359, |
| "epoch": 0.9149099598873431, |
| "grad_norm": 1.658331274986267, |
| "kl": 1.6201171875, |
| "learning_rate": 6.556609803236108e-08, |
| "loss": 0.243, |
| "reward": 1.9319197297096253, |
| "reward_std": 0.5096075862646103, |
| "rewards/accuracy_reward": 0.19285714998841286, |
| "rewards/format_reward": 0.8491071850061417, |
| "rewards/tag_count_reward": 0.8899554044008255, |
| "step": 2680 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.7491394042969, |
| "epoch": 0.9166168814542972, |
| "grad_norm": 1.9178466796875, |
| "kl": 1.453515625, |
| "learning_rate": 6.297775791013933e-08, |
| "loss": 0.2094, |
| "reward": 1.983035808801651, |
| "reward_std": 0.5048068448901176, |
| "rewards/accuracy_reward": 0.1875000089406967, |
| "rewards/format_reward": 0.8785714656114578, |
| "rewards/tag_count_reward": 0.916964328289032, |
| "step": 2685 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.7125305175781, |
| "epoch": 0.9183238030212512, |
| "grad_norm": 2.0225086212158203, |
| "kl": 1.497705078125, |
| "learning_rate": 6.044044609345228e-08, |
| "loss": 0.2327, |
| "reward": 1.9497768878936768, |
| "reward_std": 0.5056387215852738, |
| "rewards/accuracy_reward": 0.17142857983708382, |
| "rewards/format_reward": 0.8678571850061416, |
| "rewards/tag_count_reward": 0.9104911088943481, |
| "step": 2690 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 713.8303894042969, |
| "epoch": 0.9200307245882051, |
| "grad_norm": 1.8685790300369263, |
| "kl": 1.5716796875, |
| "learning_rate": 5.7954252681617304e-08, |
| "loss": 0.2246, |
| "reward": 1.9587054550647736, |
| "reward_std": 0.48516621366143226, |
| "rewards/accuracy_reward": 0.21428572656586767, |
| "rewards/format_reward": 0.8535714656114578, |
| "rewards/tag_count_reward": 0.8908482551574707, |
| "step": 2695 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 719.8518188476562, |
| "epoch": 0.9217376461551592, |
| "grad_norm": 2.501157283782959, |
| "kl": 1.40458984375, |
| "learning_rate": 5.5519265958749066e-08, |
| "loss": 0.1797, |
| "reward": 1.930803644657135, |
| "reward_std": 0.4772623166441917, |
| "rewards/accuracy_reward": 0.1508928645402193, |
| "rewards/format_reward": 0.87232146859169, |
| "rewards/tag_count_reward": 0.9075893342494965, |
| "step": 2700 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.0937927246093, |
| "epoch": 0.9234445677221131, |
| "grad_norm": 1.3878949880599976, |
| "kl": 1.0825927734375, |
| "learning_rate": 5.313557239062627e-08, |
| "loss": 0.1808, |
| "reward": 1.945089375972748, |
| "reward_std": 0.43458477333188056, |
| "rewards/accuracy_reward": 0.14017857844009995, |
| "rewards/format_reward": 0.8848214745521545, |
| "rewards/tag_count_reward": 0.9200893342494965, |
| "step": 2705 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 715.4616394042969, |
| "epoch": 0.9251514892890672, |
| "grad_norm": 1.9537073373794556, |
| "kl": 1.6349609375, |
| "learning_rate": 5.0803256621619445e-08, |
| "loss": 0.2297, |
| "reward": 1.9232143819332124, |
| "reward_std": 0.4841707475483418, |
| "rewards/accuracy_reward": 0.15625000800937414, |
| "rewards/format_reward": 0.8616071879863739, |
| "rewards/tag_count_reward": 0.9053571820259094, |
| "step": 2710 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 702.2705780029297, |
| "epoch": 0.9268584108560212, |
| "grad_norm": 1.7269147634506226, |
| "kl": 1.5267578125, |
| "learning_rate": 4.852240147168696e-08, |
| "loss": 0.1783, |
| "reward": 1.9468750834465027, |
| "reward_std": 0.5081598028540611, |
| "rewards/accuracy_reward": 0.1892857238650322, |
| "rewards/format_reward": 0.8535714685916901, |
| "rewards/tag_count_reward": 0.9040178984403611, |
| "step": 2715 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 712.5902099609375, |
| "epoch": 0.9285653324229751, |
| "grad_norm": 1.7688935995101929, |
| "kl": 1.719921875, |
| "learning_rate": 4.629308793343229e-08, |
| "loss": 0.2492, |
| "reward": 1.9412947356700898, |
| "reward_std": 0.5291612073779106, |
| "rewards/accuracy_reward": 0.19642857909202577, |
| "rewards/format_reward": 0.8500000387430191, |
| "rewards/tag_count_reward": 0.8948661148548126, |
| "step": 2720 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 704.3607452392578, |
| "epoch": 0.9302722539899292, |
| "grad_norm": 4.1701436042785645, |
| "kl": 1.4392578125, |
| "learning_rate": 4.4115395169230074e-08, |
| "loss": 0.2062, |
| "reward": 1.94665185213089, |
| "reward_std": 0.48943726569414137, |
| "rewards/accuracy_reward": 0.18125000819563866, |
| "rewards/format_reward": 0.8633928954601288, |
| "rewards/tag_count_reward": 0.9020089745521546, |
| "step": 2725 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.0794952392578, |
| "epoch": 0.9319791755568831, |
| "grad_norm": 2.8235621452331543, |
| "kl": 1.57685546875, |
| "learning_rate": 4.1989400508413264e-08, |
| "loss": 0.2257, |
| "reward": 1.9709822356700897, |
| "reward_std": 0.5317689374089241, |
| "rewards/accuracy_reward": 0.20000000819563865, |
| "rewards/format_reward": 0.8660714745521545, |
| "rewards/tag_count_reward": 0.9049107611179352, |
| "step": 2730 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 722.0044952392578, |
| "epoch": 0.9336860971238372, |
| "grad_norm": 2.165998935699463, |
| "kl": 1.6806640625, |
| "learning_rate": 3.991517944452827e-08, |
| "loss": 0.2257, |
| "reward": 1.9388393700122832, |
| "reward_std": 0.5329652637243271, |
| "rewards/accuracy_reward": 0.17857143664732575, |
| "rewards/format_reward": 0.8598214626312256, |
| "rewards/tag_count_reward": 0.9004464685916901, |
| "step": 2735 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 696.9875305175781, |
| "epoch": 0.9353930186907912, |
| "grad_norm": 1.3871349096298218, |
| "kl": 1.5708984375, |
| "learning_rate": 3.789280563265346e-08, |
| "loss": 0.2273, |
| "reward": 1.905803632736206, |
| "reward_std": 0.5343507960438728, |
| "rewards/accuracy_reward": 0.16250000800937414, |
| "rewards/format_reward": 0.8535714656114578, |
| "rewards/tag_count_reward": 0.8897321790456771, |
| "step": 2740 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.9214599609375, |
| "epoch": 0.9370999402577451, |
| "grad_norm": 1.9422177076339722, |
| "kl": 1.5888671875, |
| "learning_rate": 3.592235088678458e-08, |
| "loss": 0.1971, |
| "reward": 2.000000089406967, |
| "reward_std": 0.4781561218202114, |
| "rewards/accuracy_reward": 0.21785715334117411, |
| "rewards/format_reward": 0.873214328289032, |
| "rewards/tag_count_reward": 0.9089286148548126, |
| "step": 2745 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 710.5107421875, |
| "epoch": 0.9388068618246992, |
| "grad_norm": 1.3229178190231323, |
| "kl": 1.6900390625, |
| "learning_rate": 3.400388517728348e-08, |
| "loss": 0.2153, |
| "reward": 1.9354911625385285, |
| "reward_std": 0.5064380072057247, |
| "rewards/accuracy_reward": 0.15625000633299352, |
| "rewards/format_reward": 0.8687500476837158, |
| "rewards/tag_count_reward": 0.9104911088943481, |
| "step": 2750 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.1669952392579, |
| "epoch": 0.9405137833916531, |
| "grad_norm": 2.7505228519439697, |
| "kl": 1.655078125, |
| "learning_rate": 3.2137476628395054e-08, |
| "loss": 0.1961, |
| "reward": 1.9437500953674316, |
| "reward_std": 0.4771438464522362, |
| "rewards/accuracy_reward": 0.1848214389756322, |
| "rewards/format_reward": 0.8598214745521545, |
| "rewards/tag_count_reward": 0.8991071879863739, |
| "step": 2755 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 741.4875366210938, |
| "epoch": 0.9422207049586071, |
| "grad_norm": 3.672633409500122, |
| "kl": 1.748828125, |
| "learning_rate": 3.0323191515826076e-08, |
| "loss": 0.2204, |
| "reward": 1.8966518878936767, |
| "reward_std": 0.5492107257246971, |
| "rewards/accuracy_reward": 0.16160714998841286, |
| "rewards/format_reward": 0.8446429044008255, |
| "rewards/tag_count_reward": 0.8904018312692642, |
| "step": 2760 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.8419982910157, |
| "epoch": 0.9439276265255612, |
| "grad_norm": 2.4161508083343506, |
| "kl": 1.6482421875, |
| "learning_rate": 2.856109426439435e-08, |
| "loss": 0.2147, |
| "reward": 1.9370536506175995, |
| "reward_std": 0.5207428842782974, |
| "rewards/accuracy_reward": 0.183035721629858, |
| "rewards/format_reward": 0.8562500387430191, |
| "rewards/tag_count_reward": 0.897767898440361, |
| "step": 2765 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.3250274658203, |
| "epoch": 0.9456345480925151, |
| "grad_norm": 2.095442771911621, |
| "kl": 1.7171875, |
| "learning_rate": 2.6851247445738247e-08, |
| "loss": 0.2129, |
| "reward": 1.8669643700122833, |
| "reward_std": 0.536584535241127, |
| "rewards/accuracy_reward": 0.14910715017467738, |
| "rewards/format_reward": 0.8392857581377029, |
| "rewards/tag_count_reward": 0.8785714656114578, |
| "step": 2770 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 713.9116394042969, |
| "epoch": 0.9473414696594692, |
| "grad_norm": 2.0644876956939697, |
| "kl": 1.33837890625, |
| "learning_rate": 2.519371177609714e-08, |
| "loss": 0.1655, |
| "reward": 1.9444197297096253, |
| "reward_std": 0.4576558813452721, |
| "rewards/accuracy_reward": 0.1696428654715419, |
| "rewards/format_reward": 0.8705357521772384, |
| "rewards/tag_count_reward": 0.9042411178350449, |
| "step": 2775 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 738.437533569336, |
| "epoch": 0.9490483912264231, |
| "grad_norm": 2.707775354385376, |
| "kl": 1.655078125, |
| "learning_rate": 2.358854611415362e-08, |
| "loss": 0.2686, |
| "reward": 1.9511161744594574, |
| "reward_std": 0.5168059259653092, |
| "rewards/accuracy_reward": 0.20178572358563543, |
| "rewards/format_reward": 0.8526786148548127, |
| "rewards/tag_count_reward": 0.896651828289032, |
| "step": 2780 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.6893157958984, |
| "epoch": 0.9507553127933771, |
| "grad_norm": 2.604048013687134, |
| "kl": 1.36875, |
| "learning_rate": 2.2035807458944845e-08, |
| "loss": 0.1994, |
| "reward": 2.0129465401172637, |
| "reward_std": 0.44142256677150726, |
| "rewards/accuracy_reward": 0.2133928656578064, |
| "rewards/format_reward": 0.8803571850061417, |
| "rewards/tag_count_reward": 0.9191964626312256, |
| "step": 2785 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 699.650032043457, |
| "epoch": 0.9524622343603312, |
| "grad_norm": 0.9698619246482849, |
| "kl": 1.36435546875, |
| "learning_rate": 2.0535550947837824e-08, |
| "loss": 0.2257, |
| "reward": 1.9832590162754058, |
| "reward_std": 0.5147185429930687, |
| "rewards/accuracy_reward": 0.2035714380443096, |
| "rewards/format_reward": 0.8687500417232513, |
| "rewards/tag_count_reward": 0.9109375387430191, |
| "step": 2790 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 724.2723602294922, |
| "epoch": 0.9541691559272851, |
| "grad_norm": 2.4312736988067627, |
| "kl": 1.88720703125, |
| "learning_rate": 1.9087829854571137e-08, |
| "loss": 0.2303, |
| "reward": 1.9149554252624512, |
| "reward_std": 0.5436064839363098, |
| "rewards/accuracy_reward": 0.19196429159492254, |
| "rewards/format_reward": 0.8366071879863739, |
| "rewards/tag_count_reward": 0.8863839745521546, |
| "step": 2795 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 687.9143157958985, |
| "epoch": 0.9558760774942391, |
| "grad_norm": 1.6051888465881348, |
| "kl": 1.741796875, |
| "learning_rate": 1.7692695587363804e-08, |
| "loss": 0.2005, |
| "reward": 1.9395090281963348, |
| "reward_std": 0.5374544084072113, |
| "rewards/accuracy_reward": 0.1964285785332322, |
| "rewards/format_reward": 0.8526786088943481, |
| "rewards/tag_count_reward": 0.890401828289032, |
| "step": 2800 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 733.0598510742187, |
| "epoch": 0.9575829990611932, |
| "grad_norm": 1.3395869731903076, |
| "kl": 1.4322265625, |
| "learning_rate": 1.6350197687089897e-08, |
| "loss": 0.2022, |
| "reward": 1.9455358266830445, |
| "reward_std": 0.4857571929693222, |
| "rewards/accuracy_reward": 0.1794642936438322, |
| "rewards/format_reward": 0.8642857521772385, |
| "rewards/tag_count_reward": 0.9017857551574707, |
| "step": 2805 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 743.7893157958985, |
| "epoch": 0.9592899206281471, |
| "grad_norm": 2.4686439037323, |
| "kl": 1.9875, |
| "learning_rate": 1.5060383825518943e-08, |
| "loss": 0.1842, |
| "reward": 1.9044643819332123, |
| "reward_std": 0.5226494466885925, |
| "rewards/accuracy_reward": 0.17142857694998384, |
| "rewards/format_reward": 0.8437500417232513, |
| "rewards/tag_count_reward": 0.8892857521772385, |
| "step": 2810 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 695.3214599609375, |
| "epoch": 0.9609968421951012, |
| "grad_norm": 1.3053197860717773, |
| "kl": 1.5158203125, |
| "learning_rate": 1.3823299803622957e-08, |
| "loss": 0.2142, |
| "reward": 1.9276786625385285, |
| "reward_std": 0.45726575776934625, |
| "rewards/accuracy_reward": 0.15892857713624836, |
| "rewards/format_reward": 0.866964328289032, |
| "rewards/tag_count_reward": 0.9017857551574707, |
| "step": 2815 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 665.1518127441407, |
| "epoch": 0.9627037637620551, |
| "grad_norm": 1.9331916570663452, |
| "kl": 1.36396484375, |
| "learning_rate": 1.2638989549950742e-08, |
| "loss": 0.2132, |
| "reward": 1.9772322356700898, |
| "reward_std": 0.44482519626617434, |
| "rewards/accuracy_reward": 0.17500000707805158, |
| "rewards/format_reward": 0.8839286059141159, |
| "rewards/tag_count_reward": 0.9183036178350449, |
| "step": 2820 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 741.8071838378906, |
| "epoch": 0.9644106853290091, |
| "grad_norm": 2.253441572189331, |
| "kl": 1.952734375, |
| "learning_rate": 1.150749511906729e-08, |
| "loss": 0.2679, |
| "reward": 1.9303572177886963, |
| "reward_std": 0.5730097323656083, |
| "rewards/accuracy_reward": 0.213392869848758, |
| "rewards/format_reward": 0.8303571820259095, |
| "rewards/tag_count_reward": 0.8866071850061417, |
| "step": 2825 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 726.2223510742188, |
| "epoch": 0.9661176068959632, |
| "grad_norm": 1.3774980306625366, |
| "kl": 1.4826171875, |
| "learning_rate": 1.0428856690061161e-08, |
| "loss": 0.2107, |
| "reward": 1.9537947297096252, |
| "reward_std": 0.49637353494763375, |
| "rewards/accuracy_reward": 0.17500000782310962, |
| "rewards/format_reward": 0.8705357491970063, |
| "rewards/tag_count_reward": 0.9082589685916901, |
| "step": 2830 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 691.8553924560547, |
| "epoch": 0.9678245284629171, |
| "grad_norm": 2.7155189514160156, |
| "kl": 1.510546875, |
| "learning_rate": 9.403112565116612e-09, |
| "loss": 0.2297, |
| "reward": 1.971428644657135, |
| "reward_std": 0.4926897309720516, |
| "rewards/accuracy_reward": 0.1866071511991322, |
| "rewards/format_reward": 0.8732143253087997, |
| "rewards/tag_count_reward": 0.9116071850061417, |
| "step": 2835 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 736.6098480224609, |
| "epoch": 0.9695314500298712, |
| "grad_norm": 1.3236608505249023, |
| "kl": 1.8693359375, |
| "learning_rate": 8.430299168154853e-09, |
| "loss": 0.163, |
| "reward": 1.8939732909202576, |
| "reward_std": 0.5278340607881546, |
| "rewards/accuracy_reward": 0.15625000735744835, |
| "rewards/format_reward": 0.8473214715719223, |
| "rewards/tag_count_reward": 0.8904018312692642, |
| "step": 2840 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 715.6053894042968, |
| "epoch": 0.9712383715968251, |
| "grad_norm": 1.6628596782684326, |
| "kl": 1.5591796875, |
| "learning_rate": 7.510451043539923e-09, |
| "loss": 0.2034, |
| "reward": 1.9250000953674316, |
| "reward_std": 0.5011376716196537, |
| "rewards/accuracy_reward": 0.146428578812629, |
| "rewards/format_reward": 0.869642898440361, |
| "rewards/tag_count_reward": 0.9089286118745804, |
| "step": 2845 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.3768157958984, |
| "epoch": 0.9729452931637791, |
| "grad_norm": 1.6406400203704834, |
| "kl": 1.3693359375, |
| "learning_rate": 6.643600854851828e-09, |
| "loss": 0.1667, |
| "reward": 1.9819197297096252, |
| "reward_std": 0.4982582703232765, |
| "rewards/accuracy_reward": 0.19285715315490962, |
| "rewards/format_reward": 0.8776786088943481, |
| "rewards/tag_count_reward": 0.9113839745521546, |
| "step": 2850 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.5500335693359, |
| "epoch": 0.9746522147307332, |
| "grad_norm": 1.9430170059204102, |
| "kl": 1.658251953125, |
| "learning_rate": 5.829779383726808e-09, |
| "loss": 0.207, |
| "reward": 1.9593750953674316, |
| "reward_std": 0.5111976288259029, |
| "rewards/accuracy_reward": 0.188392866961658, |
| "rewards/format_reward": 0.86607146859169, |
| "rewards/tag_count_reward": 0.9049107611179352, |
| "step": 2855 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 663.8089660644531, |
| "epoch": 0.9763591362976871, |
| "grad_norm": 2.3743481636047363, |
| "kl": 1.263671875, |
| "learning_rate": 5.069015528765042e-09, |
| "loss": 0.1589, |
| "reward": 1.9944197475910186, |
| "reward_std": 0.45694540068507195, |
| "rewards/accuracy_reward": 0.17410715138539673, |
| "rewards/format_reward": 0.8955357521772385, |
| "rewards/tag_count_reward": 0.924776828289032, |
| "step": 2860 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.6955627441406, |
| "epoch": 0.9780660578646411, |
| "grad_norm": 1.926790714263916, |
| "kl": 1.60859375, |
| "learning_rate": 4.361336304503305e-09, |
| "loss": 0.1852, |
| "reward": 1.9319197297096253, |
| "reward_std": 0.5001587726175785, |
| "rewards/accuracy_reward": 0.1946428671479225, |
| "rewards/format_reward": 0.850892898440361, |
| "rewards/tag_count_reward": 0.8863839656114578, |
| "step": 2865 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.7044982910156, |
| "epoch": 0.9797729794315951, |
| "grad_norm": 0.963211715221405, |
| "kl": 1.59814453125, |
| "learning_rate": 3.7067668404563994e-09, |
| "loss": 0.2251, |
| "reward": 1.9821429610252381, |
| "reward_std": 0.509521733224392, |
| "rewards/accuracy_reward": 0.21160715334117414, |
| "rewards/format_reward": 0.8651786148548126, |
| "rewards/tag_count_reward": 0.9053571850061417, |
| "step": 2870 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 714.2839630126953, |
| "epoch": 0.9814799009985491, |
| "grad_norm": 1.6829819679260254, |
| "kl": 1.930078125, |
| "learning_rate": 3.105330380224536e-09, |
| "loss": 0.2473, |
| "reward": 1.8424108028411865, |
| "reward_std": 0.5233809776604176, |
| "rewards/accuracy_reward": 0.13750000530853868, |
| "rewards/format_reward": 0.8294643253087998, |
| "rewards/tag_count_reward": 0.8754464626312256, |
| "step": 2875 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 713.0312805175781, |
| "epoch": 0.9831868225655032, |
| "grad_norm": 1.9121073484420776, |
| "kl": 1.67421875, |
| "learning_rate": 2.5570482806681615e-09, |
| "loss": 0.2478, |
| "reward": 1.891071504354477, |
| "reward_std": 0.5421105667948722, |
| "rewards/accuracy_reward": 0.1598214370198548, |
| "rewards/format_reward": 0.8437500447034836, |
| "rewards/tag_count_reward": 0.8875000387430191, |
| "step": 2880 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 735.0348571777344, |
| "epoch": 0.9848937441324571, |
| "grad_norm": 2.0713632106781006, |
| "kl": 1.6849609375, |
| "learning_rate": 2.061940011149566e-09, |
| "loss": 0.2128, |
| "reward": 1.889509028196335, |
| "reward_std": 0.47978220880031586, |
| "rewards/accuracy_reward": 0.1500000067986548, |
| "rewards/format_reward": 0.850892898440361, |
| "rewards/tag_count_reward": 0.8886161148548126, |
| "step": 2885 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.9044921875, |
| "epoch": 0.9866006656994111, |
| "grad_norm": 2.39575457572937, |
| "kl": 1.4171875, |
| "learning_rate": 1.6200231528412657e-09, |
| "loss": 0.1681, |
| "reward": 1.9350447356700897, |
| "reward_std": 0.49725582599639895, |
| "rewards/accuracy_reward": 0.17589286342263222, |
| "rewards/format_reward": 0.859821480512619, |
| "rewards/tag_count_reward": 0.8993303954601288, |
| "step": 2890 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.419677734375, |
| "epoch": 0.9883075872663651, |
| "grad_norm": 2.3295395374298096, |
| "kl": 1.81171875, |
| "learning_rate": 1.2313133981020074e-09, |
| "loss": 0.2504, |
| "reward": 1.9466518700122832, |
| "reward_std": 0.5394668459892273, |
| "rewards/accuracy_reward": 0.19464286556467414, |
| "rewards/format_reward": 0.855357187986374, |
| "rewards/tag_count_reward": 0.896651828289032, |
| "step": 2895 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 719.3134246826172, |
| "epoch": 0.9900145088333191, |
| "grad_norm": 3.916964292526245, |
| "kl": 1.43359375, |
| "learning_rate": 8.958245499192108e-10, |
| "loss": 0.2138, |
| "reward": 1.9185268938541413, |
| "reward_std": 0.47515787184238434, |
| "rewards/accuracy_reward": 0.16071429513394833, |
| "rewards/format_reward": 0.8625000417232513, |
| "rewards/tag_count_reward": 0.8953125387430191, |
| "step": 2900 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 722.6116424560547, |
| "epoch": 0.9917214304002732, |
| "grad_norm": 1.7201131582260132, |
| "kl": 1.637890625, |
| "learning_rate": 6.13568521419361e-10, |
| "loss": 0.2208, |
| "reward": 1.8674107909202575, |
| "reward_std": 0.4943607971072197, |
| "rewards/accuracy_reward": 0.15535714905709028, |
| "rewards/format_reward": 0.8321429014205932, |
| "rewards/tag_count_reward": 0.8799107581377029, |
| "step": 2905 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 698.618782043457, |
| "epoch": 0.9934283519672271, |
| "grad_norm": 1.546420931816101, |
| "kl": 1.32275390625, |
| "learning_rate": 3.8455533544418106e-10, |
| "loss": 0.1588, |
| "reward": 2.012500077486038, |
| "reward_std": 0.461934956908226, |
| "rewards/accuracy_reward": 0.19642858058214188, |
| "rewards/format_reward": 0.8928571909666061, |
| "rewards/tag_count_reward": 0.9232143253087998, |
| "step": 2910 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 721.3750366210937, |
| "epoch": 0.9951352735341811, |
| "grad_norm": 4.14056396484375, |
| "kl": 1.686181640625, |
| "learning_rate": 2.0879312419574969e-10, |
| "loss": 0.2039, |
| "reward": 1.9272322297096252, |
| "reward_std": 0.5018501503393054, |
| "rewards/accuracy_reward": 0.17857143925502897, |
| "rewards/format_reward": 0.8553571850061417, |
| "rewards/tag_count_reward": 0.8933036118745804, |
| "step": 2915 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 747.6687835693359, |
| "epoch": 0.9968421951011351, |
| "grad_norm": 2.800555944442749, |
| "kl": 1.7328125, |
| "learning_rate": 8.628812894656557e-11, |
| "loss": 0.2431, |
| "reward": 1.953125101327896, |
| "reward_std": 0.5268502771854401, |
| "rewards/accuracy_reward": 0.19375001024454833, |
| "rewards/format_reward": 0.8625000387430191, |
| "rewards/tag_count_reward": 0.8968750387430191, |
| "step": 2920 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 714.4089721679687, |
| "epoch": 0.9985491166680891, |
| "grad_norm": 2.166287660598755, |
| "kl": 1.687109375, |
| "learning_rate": 1.7044699819057652e-11, |
| "loss": 0.2281, |
| "reward": 1.9647322416305542, |
| "reward_std": 0.4754629820585251, |
| "rewards/accuracy_reward": 0.18571429569274187, |
| "rewards/format_reward": 0.8696429014205933, |
| "rewards/tag_count_reward": 0.9093750327825546, |
| "step": 2925 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 734.4546127319336, |
| "epoch": 0.9999146539216524, |
| "kl": 1.63232421875, |
| "reward": 1.9659599140286446, |
| "reward_std": 0.5131321512162685, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/format_reward": 0.8705357499420643, |
| "rewards/tag_count_reward": 0.9079241491854191, |
| "step": 2929, |
| "total_flos": 0.0, |
| "train_loss": 0.22819482568542057, |
| "train_runtime": 410492.8024, |
| "train_samples_per_second": 0.228, |
| "train_steps_per_second": 0.007 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 2929, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|