Qwen-2.5-7B-Simple-RL-OpenR1-Data / trainer_state.json
miniHui's picture
Model save
8e65c32 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999146539216524,
"eval_steps": 500,
"global_step": 2929,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 1028.7812957763672,
"epoch": 0.0003413843133907997,
"grad_norm": 0.27919793128967285,
"kl": 0.0,
"learning_rate": 1.0238907849829352e-08,
"loss": 0.0496,
"reward": 0.2734375111758709,
"reward_std": 0.28523072227835655,
"rewards/accuracy_reward": 0.19196430034935474,
"rewards/format_reward": 0.017857144121080637,
"rewards/tag_count_reward": 0.06361607415601611,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 1013.6094207763672,
"epoch": 0.0017069215669539984,
"grad_norm": 0.5027251839637756,
"kl": 0.0002154707908630371,
"learning_rate": 5.119453924914676e-08,
"loss": 0.017,
"reward": 0.2388392947614193,
"reward_std": 0.24704269948415458,
"rewards/accuracy_reward": 0.1718750090803951,
"rewards/format_reward": 0.016741072293370962,
"rewards/tag_count_reward": 0.050223216734593734,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 1001.812548828125,
"epoch": 0.003413843133907997,
"grad_norm": 0.3978239595890045,
"kl": 0.00025196075439453124,
"learning_rate": 1.0238907849829352e-07,
"loss": 0.042,
"reward": 0.23370536863803865,
"reward_std": 0.24270428121089935,
"rewards/accuracy_reward": 0.17500000707805158,
"rewards/format_reward": 0.010714286286383868,
"rewards/tag_count_reward": 0.04799107422586531,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 963.5545104980469,
"epoch": 0.005120764700861995,
"grad_norm": 0.47105857729911804,
"kl": 0.0002925872802734375,
"learning_rate": 1.5358361774744026e-07,
"loss": 0.0327,
"reward": 0.2330357251688838,
"reward_std": 0.21997303143143654,
"rewards/accuracy_reward": 0.17500000745058059,
"rewards/format_reward": 0.012500000558793545,
"rewards/tag_count_reward": 0.045535716018639504,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 1039.2929138183595,
"epoch": 0.006827686267815994,
"grad_norm": 0.3969825208187103,
"kl": 0.00026645660400390623,
"learning_rate": 2.0477815699658704e-07,
"loss": 0.0455,
"reward": 0.2522321570664644,
"reward_std": 0.25163545124232767,
"rewards/accuracy_reward": 0.18928572181612252,
"rewards/format_reward": 0.011607143469154835,
"rewards/tag_count_reward": 0.05133928842842579,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 989.4536163330079,
"epoch": 0.008534607834769992,
"grad_norm": 0.24376584589481354,
"kl": 0.00028543472290039064,
"learning_rate": 2.559726962457338e-07,
"loss": 0.0339,
"reward": 0.23214286826550962,
"reward_std": 0.20356001779437066,
"rewards/accuracy_reward": 0.1812500089406967,
"rewards/format_reward": 0.009821429010480642,
"rewards/tag_count_reward": 0.04107143094297498,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 974.0723571777344,
"epoch": 0.01024152940172399,
"grad_norm": 0.42095184326171875,
"kl": 0.00038013458251953127,
"learning_rate": 3.0716723549488053e-07,
"loss": 0.0278,
"reward": 0.21383929550647734,
"reward_std": 0.21393342763185502,
"rewards/accuracy_reward": 0.15892857927829027,
"rewards/format_reward": 0.008035714644938708,
"rewards/tag_count_reward": 0.04687500244472176,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 982.08486328125,
"epoch": 0.01194845096867799,
"grad_norm": 0.7862982153892517,
"kl": 0.0004618644714355469,
"learning_rate": 3.583617747440273e-07,
"loss": 0.0243,
"reward": 0.2435267960652709,
"reward_std": 0.26311668269336225,
"rewards/accuracy_reward": 0.1633928634226322,
"rewards/format_reward": 0.014285715017467737,
"rewards/tag_count_reward": 0.06584821809083223,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 954.0571929931641,
"epoch": 0.013655372535631987,
"grad_norm": 0.5552359819412231,
"kl": 0.005234432220458984,
"learning_rate": 4.0955631399317407e-07,
"loss": 0.0108,
"reward": 0.22209822572767735,
"reward_std": 0.22155285775661468,
"rewards/accuracy_reward": 0.15803572135046123,
"rewards/format_reward": 0.007142857555299998,
"rewards/tag_count_reward": 0.0569196455180645,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 1002.6321838378906,
"epoch": 0.015362294102585987,
"grad_norm": 0.424653023481369,
"kl": 0.0035940170288085937,
"learning_rate": 4.6075085324232084e-07,
"loss": 0.0258,
"reward": 0.2645089395344257,
"reward_std": 0.2839862532913685,
"rewards/accuracy_reward": 0.18482143767178058,
"rewards/format_reward": 0.01785714365541935,
"rewards/tag_count_reward": 0.06183036016300321,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 1023.3384460449219,
"epoch": 0.017069215669539985,
"grad_norm": 0.43382683396339417,
"kl": 0.0026798248291015625,
"learning_rate": 5.119453924914676e-07,
"loss": 0.0376,
"reward": 0.26004465520381925,
"reward_std": 0.2732875030487776,
"rewards/accuracy_reward": 0.15357143431901932,
"rewards/format_reward": 0.020535715203732253,
"rewards/tag_count_reward": 0.08593750447034836,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 954.4053985595704,
"epoch": 0.018776137236493982,
"grad_norm": 0.7390643358230591,
"kl": 0.017375946044921875,
"learning_rate": 5.631399317406143e-07,
"loss": 0.0153,
"reward": 0.2988839440047741,
"reward_std": 0.32841442078351973,
"rewards/accuracy_reward": 0.15000000689178705,
"rewards/format_reward": 0.03035714467987418,
"rewards/tag_count_reward": 0.11852679029107094,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 975.8955688476562,
"epoch": 0.02048305880344798,
"grad_norm": 0.5477886199951172,
"kl": 0.021570587158203126,
"learning_rate": 6.143344709897611e-07,
"loss": 0.0459,
"reward": 0.4064732290804386,
"reward_std": 0.42607217878103254,
"rewards/accuracy_reward": 0.18392858225852252,
"rewards/format_reward": 0.05892857350409031,
"rewards/tag_count_reward": 0.1636160772293806,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 849.24736328125,
"epoch": 0.02218998037040198,
"grad_norm": 0.4776475429534912,
"kl": 0.0297119140625,
"learning_rate": 6.655290102389079e-07,
"loss": 0.0415,
"reward": 0.47053573578596114,
"reward_std": 0.470707942545414,
"rewards/accuracy_reward": 0.20803572395816444,
"rewards/format_reward": 0.06785714607685804,
"rewards/tag_count_reward": 0.19464286640286446,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 1000.8661163330078,
"epoch": 0.02389690193735598,
"grad_norm": 0.5569175481796265,
"kl": 0.018768310546875,
"learning_rate": 7.167235494880546e-07,
"loss": 0.031,
"reward": 0.4183035880327225,
"reward_std": 0.4322426520287991,
"rewards/accuracy_reward": 0.1446428621187806,
"rewards/format_reward": 0.07053571781143546,
"rewards/tag_count_reward": 0.20312501080334186,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 996.6652221679688,
"epoch": 0.025603823504309977,
"grad_norm": 0.3041313588619232,
"kl": 0.0203216552734375,
"learning_rate": 7.679180887372013e-07,
"loss": 0.0353,
"reward": 0.529464314877987,
"reward_std": 0.5286803618073463,
"rewards/accuracy_reward": 0.16250000819563865,
"rewards/format_reward": 0.12142857620492578,
"rewards/tag_count_reward": 0.24553572833538057,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 1012.7580841064453,
"epoch": 0.027310745071263975,
"grad_norm": 0.6267422437667847,
"kl": 0.0195526123046875,
"learning_rate": 8.191126279863481e-07,
"loss": 0.0586,
"reward": 0.5212053760886193,
"reward_std": 0.5019170552492142,
"rewards/accuracy_reward": 0.1580357219092548,
"rewards/format_reward": 0.1089285776950419,
"rewards/tag_count_reward": 0.2542410835623741,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 962.2661163330079,
"epoch": 0.029017666638217973,
"grad_norm": 0.5902665853500366,
"kl": 0.0366546630859375,
"learning_rate": 8.703071672354949e-07,
"loss": 0.015,
"reward": 0.6316964507102967,
"reward_std": 0.5959798350930214,
"rewards/accuracy_reward": 0.18482143711298704,
"rewards/format_reward": 0.1455357214435935,
"rewards/tag_count_reward": 0.30133929997682574,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 1017.0955841064454,
"epoch": 0.030724588205171974,
"grad_norm": 0.5561809539794922,
"kl": 0.0215423583984375,
"learning_rate": 9.215017064846417e-07,
"loss": 0.049,
"reward": 0.6587053805589675,
"reward_std": 0.5868644163012504,
"rewards/accuracy_reward": 0.18839286528527738,
"rewards/format_reward": 0.15982143692672252,
"rewards/tag_count_reward": 0.3104910857975483,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 1010.9018280029297,
"epoch": 0.03243150977212597,
"grad_norm": 0.3226785659790039,
"kl": 0.0638458251953125,
"learning_rate": 9.726962457337883e-07,
"loss": 0.064,
"reward": 0.6285714544355869,
"reward_std": 0.6115485787391662,
"rewards/accuracy_reward": 0.1625000076368451,
"rewards/format_reward": 0.1616071503609419,
"rewards/tag_count_reward": 0.3044643014669418,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 989.2018280029297,
"epoch": 0.03413843133907997,
"grad_norm": 0.38331085443496704,
"kl": 0.0280029296875,
"learning_rate": 1.0238907849829352e-06,
"loss": 0.044,
"reward": 0.7116071745753288,
"reward_std": 0.6793092235922813,
"rewards/accuracy_reward": 0.15000000689178705,
"rewards/format_reward": 0.20982143543660642,
"rewards/tag_count_reward": 0.3517857328057289,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 989.4839752197265,
"epoch": 0.03584535290603397,
"grad_norm": 0.31946083903312683,
"kl": 0.028564453125,
"learning_rate": 1.075085324232082e-06,
"loss": 0.05,
"reward": 0.7866071730852127,
"reward_std": 0.6944080710411071,
"rewards/accuracy_reward": 0.15625000447034837,
"rewards/format_reward": 0.24464286714792252,
"rewards/tag_count_reward": 0.38571430146694186,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 993.9964782714844,
"epoch": 0.037552274472987965,
"grad_norm": 0.48181456327438354,
"kl": 0.055322265625,
"learning_rate": 1.1262798634812287e-06,
"loss": 0.0478,
"reward": 0.8959821820259094,
"reward_std": 0.7488934248685837,
"rewards/accuracy_reward": 0.15357143450528382,
"rewards/format_reward": 0.30000001564621925,
"rewards/tag_count_reward": 0.44241073727607727,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 982.9768310546875,
"epoch": 0.03925919603994196,
"grad_norm": 0.5322438478469849,
"kl": 0.094439697265625,
"learning_rate": 1.1774744027303754e-06,
"loss": 0.027,
"reward": 0.9075893253087998,
"reward_std": 0.6916713267564774,
"rewards/accuracy_reward": 0.175892864074558,
"rewards/format_reward": 0.29464287012815477,
"rewards/tag_count_reward": 0.43705359250307085,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 1018.8009429931641,
"epoch": 0.04096611760689596,
"grad_norm": 2.290813684463501,
"kl": 1.0316864013671876,
"learning_rate": 1.2286689419795221e-06,
"loss": 0.1651,
"reward": 0.8718750342726708,
"reward_std": 0.71419677734375,
"rewards/accuracy_reward": 0.12053572116419672,
"rewards/format_reward": 0.3089285887777805,
"rewards/tag_count_reward": 0.4424107387661934,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 1019.8580902099609,
"epoch": 0.04267303917384996,
"grad_norm": 1.6987059116363525,
"kl": 0.052471923828125,
"learning_rate": 1.279863481228669e-06,
"loss": 0.0799,
"reward": 0.9720982730388641,
"reward_std": 0.7306801319122315,
"rewards/accuracy_reward": 0.175000006146729,
"rewards/format_reward": 0.32232144474983215,
"rewards/tag_count_reward": 0.47477681189775467,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1004.2143371582031,
"epoch": 0.04437996074080396,
"grad_norm": 3.3034074306488037,
"kl": 0.07410888671875,
"learning_rate": 1.3310580204778158e-06,
"loss": 0.0982,
"reward": 0.9348214775323868,
"reward_std": 0.7496399849653244,
"rewards/accuracy_reward": 0.13303572088479995,
"rewards/format_reward": 0.3044643007218838,
"rewards/tag_count_reward": 0.4973214492201805,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 1024.3098571777343,
"epoch": 0.04608688230775796,
"grad_norm": 1.845048427581787,
"kl": 0.21453857421875,
"learning_rate": 1.3822525597269625e-06,
"loss": 0.0697,
"reward": 1.1470982640981675,
"reward_std": 0.7625433832406998,
"rewards/accuracy_reward": 0.16428572153672577,
"rewards/format_reward": 0.3892857313156128,
"rewards/tag_count_reward": 0.5935268118977547,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1005.0714691162109,
"epoch": 0.04779380387471196,
"grad_norm": 1.300987720489502,
"kl": 0.5607666015625,
"learning_rate": 1.4334470989761092e-06,
"loss": 0.1016,
"reward": 1.3178571999073028,
"reward_std": 0.7537211120128632,
"rewards/accuracy_reward": 0.17410714970901608,
"rewards/format_reward": 0.46875002086162565,
"rewards/tag_count_reward": 0.6750000268220901,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 971.468798828125,
"epoch": 0.049500725441665956,
"grad_norm": 1.4947351217269897,
"kl": 0.1273681640625,
"learning_rate": 1.484641638225256e-06,
"loss": 0.0903,
"reward": 1.4821429193019866,
"reward_std": 0.7479644685983657,
"rewards/accuracy_reward": 0.17321429271250963,
"rewards/format_reward": 0.5758928790688514,
"rewards/tag_count_reward": 0.7330357491970062,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 907.8357604980469,
"epoch": 0.051207647008619954,
"grad_norm": 3.5529232025146484,
"kl": 1.070947265625,
"learning_rate": 1.5358361774744026e-06,
"loss": 0.0934,
"reward": 1.5294643580913543,
"reward_std": 0.7225078850984573,
"rewards/accuracy_reward": 0.186607151851058,
"rewards/format_reward": 0.5839285984635353,
"rewards/tag_count_reward": 0.7589286029338836,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 962.1339691162109,
"epoch": 0.05291456857557395,
"grad_norm": 146.53732299804688,
"kl": 0.70908203125,
"learning_rate": 1.5870307167235496e-06,
"loss": 0.0814,
"reward": 1.5712054193019866,
"reward_std": 0.7219390630722046,
"rewards/accuracy_reward": 0.1517857219092548,
"rewards/format_reward": 0.6169643193483353,
"rewards/tag_count_reward": 0.8024553954601288,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 974.2196868896484,
"epoch": 0.05462149014252795,
"grad_norm": 38.124210357666016,
"kl": 2.0916015625,
"learning_rate": 1.6382252559726963e-06,
"loss": 0.1192,
"reward": 1.5966518700122834,
"reward_std": 0.7490073859691619,
"rewards/accuracy_reward": 0.18214286491274834,
"rewards/format_reward": 0.6294643074274063,
"rewards/tag_count_reward": 0.7850446790456772,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 1026.6437927246093,
"epoch": 0.05632841170948195,
"grad_norm": 50.31554412841797,
"kl": 1.8455078125,
"learning_rate": 1.6894197952218432e-06,
"loss": 0.1378,
"reward": 1.5475447177886963,
"reward_std": 0.740731555223465,
"rewards/accuracy_reward": 0.14553572097793221,
"rewards/format_reward": 0.6187500298023224,
"rewards/tag_count_reward": 0.7832589596509933,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 1012.067904663086,
"epoch": 0.058035333276435945,
"grad_norm": 55.0086555480957,
"kl": 2.6337890625,
"learning_rate": 1.7406143344709897e-06,
"loss": 0.1877,
"reward": 1.483928644657135,
"reward_std": 0.8173549324274063,
"rewards/accuracy_reward": 0.15535715073347092,
"rewards/format_reward": 0.5758928820490837,
"rewards/tag_count_reward": 0.752678605914116,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 1027.2464630126954,
"epoch": 0.05974225484338994,
"grad_norm": 58.59043502807617,
"kl": 3.63203125,
"learning_rate": 1.7918088737201367e-06,
"loss": 0.2713,
"reward": 1.4720982789993287,
"reward_std": 0.8386385828256607,
"rewards/accuracy_reward": 0.15535714933648706,
"rewards/format_reward": 0.5723214507102966,
"rewards/tag_count_reward": 0.7444196730852127,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1085.983087158203,
"epoch": 0.06144917641034395,
"grad_norm": 118.3796615600586,
"kl": 3.1908203125,
"learning_rate": 1.8430034129692834e-06,
"loss": 0.2424,
"reward": 1.3830357909202575,
"reward_std": 0.8672506153583527,
"rewards/accuracy_reward": 0.14375000717118383,
"rewards/format_reward": 0.5339285925030708,
"rewards/tag_count_reward": 0.7053571850061416,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 1069.7571899414063,
"epoch": 0.06315609797729795,
"grad_norm": 1056.704833984375,
"kl": 4.60859375,
"learning_rate": 1.8941979522184299e-06,
"loss": 0.349,
"reward": 1.483705425262451,
"reward_std": 0.8457996159791946,
"rewards/accuracy_reward": 0.17142857862636446,
"rewards/format_reward": 0.5830357432365417,
"rewards/tag_count_reward": 0.729241105914116,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 1033.57861328125,
"epoch": 0.06486301954425194,
"grad_norm": 116.93949890136719,
"kl": 4.104296875,
"learning_rate": 1.9453924914675766e-06,
"loss": 0.2809,
"reward": 1.4609375596046448,
"reward_std": 0.8616278827190399,
"rewards/accuracy_reward": 0.168750009406358,
"rewards/format_reward": 0.5660714522004128,
"rewards/tag_count_reward": 0.7261161029338836,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1109.1634399414063,
"epoch": 0.06656994111120594,
"grad_norm": 226.57644653320312,
"kl": 4.3140625,
"learning_rate": 1.9965870307167235e-06,
"loss": 0.3192,
"reward": 1.3892857730388641,
"reward_std": 0.8872519373893738,
"rewards/accuracy_reward": 0.11160714821889997,
"rewards/format_reward": 0.5732143193483352,
"rewards/tag_count_reward": 0.704464316368103,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 1111.0705841064453,
"epoch": 0.06827686267815994,
"grad_norm": 40.324981689453125,
"kl": 6.765625,
"learning_rate": 2.0477815699658705e-06,
"loss": 0.4585,
"reward": 1.3991072058677674,
"reward_std": 0.904101237654686,
"rewards/accuracy_reward": 0.14107143497094513,
"rewards/format_reward": 0.5598214507102967,
"rewards/tag_count_reward": 0.6982143223285675,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 1101.1598724365235,
"epoch": 0.06998378424511394,
"grad_norm": 24.814128875732422,
"kl": 3.97109375,
"learning_rate": 2.098976109215017e-06,
"loss": 0.3623,
"reward": 1.4299107789993286,
"reward_std": 0.8886282354593277,
"rewards/accuracy_reward": 0.1821428656578064,
"rewards/format_reward": 0.5482143044471741,
"rewards/tag_count_reward": 0.6995536029338837,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 1134.5839721679688,
"epoch": 0.07169070581206793,
"grad_norm": 35.07132339477539,
"kl": 4.96171875,
"learning_rate": 2.150170648464164e-06,
"loss": 0.4362,
"reward": 1.3689732730388642,
"reward_std": 0.9125554233789444,
"rewards/accuracy_reward": 0.15535715082660317,
"rewards/format_reward": 0.5392857372760773,
"rewards/tag_count_reward": 0.6743303894996643,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 1099.2161224365234,
"epoch": 0.07339762737902193,
"grad_norm": 27.72007942199707,
"kl": 5.187109375,
"learning_rate": 2.201365187713311e-06,
"loss": 0.4161,
"reward": 1.3982143580913544,
"reward_std": 0.8859813660383224,
"rewards/accuracy_reward": 0.18750000819563867,
"rewards/format_reward": 0.5401785910129547,
"rewards/tag_count_reward": 0.6705357432365417,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 1172.6286163330078,
"epoch": 0.07510454894597593,
"grad_norm": 28.218076705932617,
"kl": 4.496875,
"learning_rate": 2.2525597269624573e-06,
"loss": 0.3957,
"reward": 1.3350447058677672,
"reward_std": 0.907436516880989,
"rewards/accuracy_reward": 0.14375000763684512,
"rewards/format_reward": 0.5258928805589675,
"rewards/tag_count_reward": 0.6654018104076386,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1138.0920104980469,
"epoch": 0.07681147051292993,
"grad_norm": 30.30360221862793,
"kl": 4.7421875,
"learning_rate": 2.3037542662116043e-06,
"loss": 0.3741,
"reward": 1.3508929312229156,
"reward_std": 0.9466613680124283,
"rewards/accuracy_reward": 0.15625000512227416,
"rewards/format_reward": 0.5321428820490837,
"rewards/tag_count_reward": 0.6625000238418579,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 1146.39736328125,
"epoch": 0.07851839207988393,
"grad_norm": 15.380309104919434,
"kl": 6.12265625,
"learning_rate": 2.3549488054607508e-06,
"loss": 0.4282,
"reward": 1.33727685213089,
"reward_std": 0.9166835993528366,
"rewards/accuracy_reward": 0.13839286239817739,
"rewards/format_reward": 0.5285714492201805,
"rewards/tag_count_reward": 0.6703125298023224,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 1135.8580841064454,
"epoch": 0.08022531364683792,
"grad_norm": 49.58182907104492,
"kl": 4.6671875,
"learning_rate": 2.4061433447098977e-06,
"loss": 0.3852,
"reward": 1.3584822177886964,
"reward_std": 0.8921870917081833,
"rewards/accuracy_reward": 0.12500000577419995,
"rewards/format_reward": 0.5598214492201805,
"rewards/tag_count_reward": 0.673660746216774,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 1134.0955810546875,
"epoch": 0.08193223521379192,
"grad_norm": 9.52077865600586,
"kl": 4.63515625,
"learning_rate": 2.4573378839590442e-06,
"loss": 0.3793,
"reward": 1.4533482909202575,
"reward_std": 0.9463501214981079,
"rewards/accuracy_reward": 0.21607143534347414,
"rewards/format_reward": 0.5598214536905288,
"rewards/tag_count_reward": 0.6774553894996643,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 1126.5911193847655,
"epoch": 0.08363915678074592,
"grad_norm": 12.795201301574707,
"kl": 5.07578125,
"learning_rate": 2.508532423208191e-06,
"loss": 0.444,
"reward": 1.4241071879863738,
"reward_std": 0.9276215642690658,
"rewards/accuracy_reward": 0.16607143776491284,
"rewards/format_reward": 0.5776786029338836,
"rewards/tag_count_reward": 0.6803571671247483,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 1206.5196990966797,
"epoch": 0.08534607834769992,
"grad_norm": 25.597183227539062,
"kl": 6.453125,
"learning_rate": 2.559726962457338e-06,
"loss": 0.5221,
"reward": 1.3265625447034837,
"reward_std": 0.9622172951698303,
"rewards/accuracy_reward": 0.16517857862636448,
"rewards/format_reward": 0.529464316368103,
"rewards/tag_count_reward": 0.631919664144516,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 1214.5321838378907,
"epoch": 0.08705299991465393,
"grad_norm": 12.811568260192871,
"kl": 5.79453125,
"learning_rate": 2.6109215017064846e-06,
"loss": 0.5348,
"reward": 1.3906250774860383,
"reward_std": 0.9294156819581986,
"rewards/accuracy_reward": 0.1723214373923838,
"rewards/format_reward": 0.5633928835391998,
"rewards/tag_count_reward": 0.6549107417464256,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 1202.7625549316406,
"epoch": 0.08875992148160793,
"grad_norm": 154.68002319335938,
"kl": 5.7234375,
"learning_rate": 2.6621160409556315e-06,
"loss": 0.4805,
"reward": 1.361384004354477,
"reward_std": 0.9610002607107162,
"rewards/accuracy_reward": 0.1544642921537161,
"rewards/format_reward": 0.555357164144516,
"rewards/tag_count_reward": 0.6515625298023224,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 1146.6179077148438,
"epoch": 0.09046684304856192,
"grad_norm": 6.1883440017700195,
"kl": 5.08046875,
"learning_rate": 2.7133105802047784e-06,
"loss": 0.4617,
"reward": 1.4383929252624512,
"reward_std": 0.9193685740232468,
"rewards/accuracy_reward": 0.14017857760190963,
"rewards/format_reward": 0.6000000268220902,
"rewards/tag_count_reward": 0.698214316368103,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 1172.1741607666015,
"epoch": 0.09217376461551592,
"grad_norm": 11.599955558776855,
"kl": 5.28671875,
"learning_rate": 2.764505119453925e-06,
"loss": 0.4665,
"reward": 1.4604911386966706,
"reward_std": 0.923082035779953,
"rewards/accuracy_reward": 0.15178572060540318,
"rewards/format_reward": 0.6098214566707612,
"rewards/tag_count_reward": 0.6988839596509934,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 1103.8205841064453,
"epoch": 0.09388068618246992,
"grad_norm": 24.47557830810547,
"kl": 5.521875,
"learning_rate": 2.8156996587030715e-06,
"loss": 0.4808,
"reward": 1.4462054193019866,
"reward_std": 0.8959993481636047,
"rewards/accuracy_reward": 0.141071433480829,
"rewards/format_reward": 0.6080357372760773,
"rewards/tag_count_reward": 0.6970982432365418,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 1036.750051879883,
"epoch": 0.09558760774942392,
"grad_norm": 18.15721893310547,
"kl": 5.049609375,
"learning_rate": 2.8668941979522184e-06,
"loss": 0.3953,
"reward": 1.5037946939468383,
"reward_std": 0.8739502459764481,
"rewards/accuracy_reward": 0.14821429317817092,
"rewards/format_reward": 0.6312500327825546,
"rewards/tag_count_reward": 0.7243303894996643,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 1079.7732727050782,
"epoch": 0.09729452931637791,
"grad_norm": 9.835553169250488,
"kl": 4.321875,
"learning_rate": 2.9180887372013653e-06,
"loss": 0.3538,
"reward": 1.4616072118282317,
"reward_std": 0.9075537651777268,
"rewards/accuracy_reward": 0.15446429196745157,
"rewards/format_reward": 0.6044643148779869,
"rewards/tag_count_reward": 0.7026785999536515,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 1014.2696868896485,
"epoch": 0.09900145088333191,
"grad_norm": 65.74641418457031,
"kl": 5.026953125,
"learning_rate": 2.969283276450512e-06,
"loss": 0.4408,
"reward": 1.6125000774860383,
"reward_std": 0.8444355905056,
"rewards/accuracy_reward": 0.16428572293370963,
"rewards/format_reward": 0.685714316368103,
"rewards/tag_count_reward": 0.7625000268220902,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 975.2527221679687,
"epoch": 0.10070837245028591,
"grad_norm": 8.98360538482666,
"kl": 4.430078125,
"learning_rate": 2.999995738818993e-06,
"loss": 0.4058,
"reward": 1.6013393580913544,
"reward_std": 0.8386077880859375,
"rewards/accuracy_reward": 0.15714286370202898,
"rewards/format_reward": 0.6794643223285675,
"rewards/tag_count_reward": 0.7647321850061417,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 1040.8562866210937,
"epoch": 0.10241529401723991,
"grad_norm": 7.139811992645264,
"kl": 5.41328125,
"learning_rate": 2.9999478008106995e-06,
"loss": 0.5289,
"reward": 1.5127232789993286,
"reward_std": 0.8606418490409851,
"rewards/accuracy_reward": 0.18571429569274187,
"rewards/format_reward": 0.6151785984635353,
"rewards/tag_count_reward": 0.7118303835391998,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 999.74111328125,
"epoch": 0.1041222155841939,
"grad_norm": 8.274175643920898,
"kl": 4.5482421875,
"learning_rate": 2.9998466000257944e-06,
"loss": 0.3431,
"reward": 1.5814732909202576,
"reward_std": 0.8399073332548141,
"rewards/accuracy_reward": 0.17142857825383545,
"rewards/format_reward": 0.6616071701049805,
"rewards/tag_count_reward": 0.7484375268220902,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 984.4464752197266,
"epoch": 0.1058291371511479,
"grad_norm": 5.403537750244141,
"kl": 4.11796875,
"learning_rate": 2.999692140057893e-06,
"loss": 0.3573,
"reward": 1.6404018580913544,
"reward_std": 0.7838103622198105,
"rewards/accuracy_reward": 0.22946429662406445,
"rewards/format_reward": 0.658035746216774,
"rewards/tag_count_reward": 0.752901816368103,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 1023.7839813232422,
"epoch": 0.1075360587181019,
"grad_norm": 10.112387657165527,
"kl": 4.18515625,
"learning_rate": 2.999484426391831e-06,
"loss": 0.3199,
"reward": 1.5156250596046448,
"reward_std": 0.8099619418382644,
"rewards/accuracy_reward": 0.11517857648432255,
"rewards/format_reward": 0.6580357402563095,
"rewards/tag_count_reward": 0.7424107432365418,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 939.3991516113281,
"epoch": 0.1092429802850559,
"grad_norm": 4.295641899108887,
"kl": 3.85234375,
"learning_rate": 2.9992234664034687e-06,
"loss": 0.3389,
"reward": 1.6013393521308898,
"reward_std": 0.8064254641532898,
"rewards/accuracy_reward": 0.16875000642612575,
"rewards/format_reward": 0.6758928924798966,
"rewards/tag_count_reward": 0.7566964656114579,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 944.2964782714844,
"epoch": 0.1109499018520099,
"grad_norm": 6.538363456726074,
"kl": 4.298046875,
"learning_rate": 2.998909269359431e-06,
"loss": 0.3693,
"reward": 1.712946504354477,
"reward_std": 0.7462773695588112,
"rewards/accuracy_reward": 0.1812500067986548,
"rewards/format_reward": 0.7276786029338836,
"rewards/tag_count_reward": 0.8040178924798965,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 995.8259307861329,
"epoch": 0.1126568234189639,
"grad_norm": 9.679475784301758,
"kl": 3.96171875,
"learning_rate": 2.9985418464167776e-06,
"loss": 0.3515,
"reward": 1.6656250715255738,
"reward_std": 0.7768863618373871,
"rewards/accuracy_reward": 0.18928572479635478,
"rewards/format_reward": 0.6937500298023224,
"rewards/tag_count_reward": 0.7825893223285675,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 1035.0509338378906,
"epoch": 0.11436374498591789,
"grad_norm": 10.23958683013916,
"kl": 4.15234375,
"learning_rate": 2.9981212106226067e-06,
"loss": 0.4532,
"reward": 1.7529018700122834,
"reward_std": 0.7962618798017502,
"rewards/accuracy_reward": 0.20982143813744186,
"rewards/format_reward": 0.7437500268220901,
"rewards/tag_count_reward": 0.7993303954601287,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 1050.3491577148438,
"epoch": 0.11607066655287189,
"grad_norm": 21.458248138427734,
"kl": 3.53984375,
"learning_rate": 2.9976473769135918e-06,
"loss": 0.3423,
"reward": 1.7979911386966705,
"reward_std": 0.7509608373045922,
"rewards/accuracy_reward": 0.24285715371370314,
"rewards/format_reward": 0.7500000417232513,
"rewards/tag_count_reward": 0.8051339626312256,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 985.1045043945312,
"epoch": 0.11777758811982589,
"grad_norm": 8.980128288269043,
"kl": 4.795703125,
"learning_rate": 2.997120362115451e-06,
"loss": 0.4425,
"reward": 1.8357143640518188,
"reward_std": 0.6679434359073639,
"rewards/accuracy_reward": 0.20267857778817416,
"rewards/format_reward": 0.7964286059141159,
"rewards/tag_count_reward": 0.8366071820259094,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 926.8366455078125,
"epoch": 0.11948450968677989,
"grad_norm": 12.840106964111328,
"kl": 2.9732421875,
"learning_rate": 2.99654018494235e-06,
"loss": 0.3475,
"reward": 1.8665179431438446,
"reward_std": 0.665838934481144,
"rewards/accuracy_reward": 0.22589286882430315,
"rewards/format_reward": 0.7919643223285675,
"rewards/tag_count_reward": 0.8486607581377029,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 898.8857543945312,
"epoch": 0.1211914312537339,
"grad_norm": 1.285402774810791,
"kl": 2.25703125,
"learning_rate": 2.9959068659962367e-06,
"loss": 0.2652,
"reward": 1.908035808801651,
"reward_std": 0.5491333983838558,
"rewards/accuracy_reward": 0.16607143646106123,
"rewards/format_reward": 0.8508929014205933,
"rewards/tag_count_reward": 0.8910714685916901,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 834.8294952392578,
"epoch": 0.1228983528206879,
"grad_norm": 16.34928321838379,
"kl": 4.01875,
"learning_rate": 2.995220427766111e-06,
"loss": 0.3496,
"reward": 1.764285796880722,
"reward_std": 0.6508073821663857,
"rewards/accuracy_reward": 0.1687500079162419,
"rewards/format_reward": 0.7500000327825547,
"rewards/tag_count_reward": 0.8455357521772384,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 824.4848602294921,
"epoch": 0.12460527438764189,
"grad_norm": 22.009183883666992,
"kl": 1.86591796875,
"learning_rate": 2.994480894627225e-06,
"loss": 0.1837,
"reward": 1.7015625834465027,
"reward_std": 0.6883793324232101,
"rewards/accuracy_reward": 0.16964286332949996,
"rewards/format_reward": 0.7187500268220901,
"rewards/tag_count_reward": 0.8131696820259094,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 828.4500396728515,
"epoch": 0.1263121959545959,
"grad_norm": 11.808023452758789,
"kl": 3.427734375,
"learning_rate": 2.9936882928402187e-06,
"loss": 0.2191,
"reward": 1.6609375894069671,
"reward_std": 0.7794422417879104,
"rewards/accuracy_reward": 0.20357143841683864,
"rewards/format_reward": 0.689285746216774,
"rewards/tag_count_reward": 0.7680803865194321,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 899.5518280029297,
"epoch": 0.12801911752154987,
"grad_norm": 3.559278964996338,
"kl": 2.0583984375,
"learning_rate": 2.992842650550186e-06,
"loss": 0.149,
"reward": 1.731026864051819,
"reward_std": 0.6875755071640015,
"rewards/accuracy_reward": 0.18214286481961608,
"rewards/format_reward": 0.7375000357627869,
"rewards/tag_count_reward": 0.81138396859169,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 935.9803985595703,
"epoch": 0.12972603908850389,
"grad_norm": 7.016046524047852,
"kl": 2.69150390625,
"learning_rate": 2.991943997785676e-06,
"loss": 0.21,
"reward": 1.8580358147621154,
"reward_std": 0.6217495501041412,
"rewards/accuracy_reward": 0.1910714365541935,
"rewards/format_reward": 0.8044643223285675,
"rewards/tag_count_reward": 0.8625000417232513,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 911.315219116211,
"epoch": 0.13143296065545787,
"grad_norm": 7.220456600189209,
"kl": 2.14296875,
"learning_rate": 2.9909923664576264e-06,
"loss": 0.1268,
"reward": 1.8678572356700898,
"reward_std": 0.5655309081077575,
"rewards/accuracy_reward": 0.19107143981382252,
"rewards/format_reward": 0.80357146859169,
"rewards/tag_count_reward": 0.8732143253087997,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 862.4652130126954,
"epoch": 0.13313988222241188,
"grad_norm": 7.075647830963135,
"kl": 1.9138671875,
"learning_rate": 2.9899877903582307e-06,
"loss": 0.1381,
"reward": 1.7910715162754058,
"reward_std": 0.5908127099275589,
"rewards/accuracy_reward": 0.16250000642612578,
"rewards/format_reward": 0.77857146859169,
"rewards/tag_count_reward": 0.8500000417232514,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 824.7241424560547,
"epoch": 0.13484680378936587,
"grad_norm": 4.559528350830078,
"kl": 1.3359130859375,
"learning_rate": 2.9889303051597403e-06,
"loss": 0.1208,
"reward": 1.9924108028411864,
"reward_std": 0.4078477367758751,
"rewards/accuracy_reward": 0.18035715036094188,
"rewards/format_reward": 0.8848214656114578,
"rewards/tag_count_reward": 0.9272321879863739,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 915.5607513427734,
"epoch": 0.13655372535631988,
"grad_norm": 6.0182414054870605,
"kl": 1.38389892578125,
"learning_rate": 2.9878199484131928e-06,
"loss": 0.1426,
"reward": 1.8991072237491609,
"reward_std": 0.5109399899840354,
"rewards/accuracy_reward": 0.20892858393490316,
"rewards/format_reward": 0.8116071820259094,
"rewards/tag_count_reward": 0.8785714656114578,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 918.3304016113282,
"epoch": 0.1382606469232739,
"grad_norm": 7.766772270202637,
"kl": 2.130078125,
"learning_rate": 2.986656759547082e-06,
"loss": 0.2231,
"reward": 1.9397322297096253,
"reward_std": 0.5156191930174827,
"rewards/accuracy_reward": 0.22053572321310638,
"rewards/format_reward": 0.8276786118745804,
"rewards/tag_count_reward": 0.891517898440361,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 915.7795013427734,
"epoch": 0.13996756849022787,
"grad_norm": 21.836950302124023,
"kl": 1.909521484375,
"learning_rate": 2.9854407798659583e-06,
"loss": 0.2047,
"reward": 1.9703125834465027,
"reward_std": 0.5022375226020813,
"rewards/accuracy_reward": 0.202678582072258,
"rewards/format_reward": 0.8616071850061416,
"rewards/tag_count_reward": 0.906026828289032,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 858.590219116211,
"epoch": 0.14167449005718188,
"grad_norm": 18.037555694580078,
"kl": 2.42880859375,
"learning_rate": 2.984172052548961e-06,
"loss": 0.2126,
"reward": 1.9808036744594575,
"reward_std": 0.5602116242051125,
"rewards/accuracy_reward": 0.22410715455189348,
"rewards/format_reward": 0.8526786148548127,
"rewards/tag_count_reward": 0.9040179014205932,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 856.7375366210938,
"epoch": 0.14338141162413587,
"grad_norm": 8.315873146057129,
"kl": 3.725,
"learning_rate": 2.982850622648283e-06,
"loss": 0.2975,
"reward": 1.8669643700122833,
"reward_std": 0.62198735922575,
"rewards/accuracy_reward": 0.20625000745058059,
"rewards/format_reward": 0.8044643253087997,
"rewards/tag_count_reward": 0.8562500387430191,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 860.3080749511719,
"epoch": 0.14508833319108988,
"grad_norm": 6.0542707443237305,
"kl": 2.50927734375,
"learning_rate": 2.9814765370875757e-06,
"loss": 0.1952,
"reward": 1.7029018640518188,
"reward_std": 0.732259088754654,
"rewards/accuracy_reward": 0.2017857251688838,
"rewards/format_reward": 0.7133928894996643,
"rewards/tag_count_reward": 0.7877232491970062,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 801.1982513427735,
"epoch": 0.14679525475804386,
"grad_norm": 7.207229137420654,
"kl": 3.885546875,
"learning_rate": 2.9800498446602777e-06,
"loss": 0.2705,
"reward": 1.6745536506175995,
"reward_std": 0.6814444154500962,
"rewards/accuracy_reward": 0.15446429196745157,
"rewards/format_reward": 0.7223214626312255,
"rewards/tag_count_reward": 0.7977678924798965,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 867.9830749511718,
"epoch": 0.14850217632499788,
"grad_norm": 3.2406466007232666,
"kl": 1.6208984375,
"learning_rate": 2.9785705960278854e-06,
"loss": 0.1428,
"reward": 1.6584822237491608,
"reward_std": 0.6414668798446655,
"rewards/accuracy_reward": 0.17946429708972572,
"rewards/format_reward": 0.6955357402563095,
"rewards/tag_count_reward": 0.7834821820259095,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 774.519677734375,
"epoch": 0.15020909789195186,
"grad_norm": 5.495645046234131,
"kl": 2.61328125,
"learning_rate": 2.977038843718153e-06,
"loss": 0.0369,
"reward": 1.54977685213089,
"reward_std": 0.7357341796159744,
"rewards/accuracy_reward": 0.16964286426082253,
"rewards/format_reward": 0.608035746216774,
"rewards/tag_count_reward": 0.772098246216774,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 753.3616424560547,
"epoch": 0.15191601945890587,
"grad_norm": 4.03928804397583,
"kl": 1.2857421875,
"learning_rate": 2.975454642123228e-06,
"loss": 0.0017,
"reward": 1.5091518580913543,
"reward_std": 0.7535226970911026,
"rewards/accuracy_reward": 0.24642858542501928,
"rewards/format_reward": 0.5053571626543999,
"rewards/tag_count_reward": 0.7573661029338836,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 806.1402191162109,
"epoch": 0.15362294102585985,
"grad_norm": 2.121255874633789,
"kl": 1.4345703125,
"learning_rate": 2.9738180474977184e-06,
"loss": 0.0222,
"reward": 1.5348214983940125,
"reward_std": 0.7348162770271301,
"rewards/accuracy_reward": 0.17053572023287417,
"rewards/format_reward": 0.5758928850293159,
"rewards/tag_count_reward": 0.7883928894996644,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 787.08486328125,
"epoch": 0.15532986259281387,
"grad_norm": 2.3794944286346436,
"kl": 2.43564453125,
"learning_rate": 2.972129117956695e-06,
"loss": 0.0834,
"reward": 1.6506697297096253,
"reward_std": 0.7070199698209763,
"rewards/accuracy_reward": 0.1687500079162419,
"rewards/format_reward": 0.6589286088943481,
"rewards/tag_count_reward": 0.8229911088943481,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 824.7964599609375,
"epoch": 0.15703678415976785,
"grad_norm": 1.7041521072387695,
"kl": 0.3784423828125,
"learning_rate": 2.9703879134736304e-06,
"loss": 0.0166,
"reward": 1.7145090103149414,
"reward_std": 0.6771728962659835,
"rewards/accuracy_reward": 0.16964286416769028,
"rewards/format_reward": 0.7071428954601288,
"rewards/tag_count_reward": 0.8377232521772384,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 845.7839599609375,
"epoch": 0.15874370572672186,
"grad_norm": 15.45041561126709,
"kl": 2.6501953125,
"learning_rate": 2.968594495878266e-06,
"loss": 0.0991,
"reward": 1.7546875834465028,
"reward_std": 0.613152152299881,
"rewards/accuracy_reward": 0.18303572051227093,
"rewards/format_reward": 0.7321428894996643,
"rewards/tag_count_reward": 0.8395089656114578,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 880.5625366210937,
"epoch": 0.16045062729367585,
"grad_norm": 1.617084264755249,
"kl": 0.76259765625,
"learning_rate": 2.9667489288544177e-06,
"loss": 0.028,
"reward": 1.8450893878936767,
"reward_std": 0.5999127000570297,
"rewards/accuracy_reward": 0.20714286714792252,
"rewards/format_reward": 0.7776786088943481,
"rewards/tag_count_reward": 0.8602678865194321,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 810.7125396728516,
"epoch": 0.16215754886062986,
"grad_norm": 6.216405391693115,
"kl": 1.2310546875,
"learning_rate": 2.964851277937717e-06,
"loss": 0.0479,
"reward": 1.5607143580913543,
"reward_std": 0.6846371173858643,
"rewards/accuracy_reward": 0.1517857201397419,
"rewards/format_reward": 0.6464286029338837,
"rewards/tag_count_reward": 0.7625000357627869,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 854.5232513427734,
"epoch": 0.16386447042758384,
"grad_norm": 4.113749980926514,
"kl": 0.834716796875,
"learning_rate": 2.9629016105132797e-06,
"loss": 0.0379,
"reward": 1.7453125953674316,
"reward_std": 0.6409839779138565,
"rewards/accuracy_reward": 0.19017858048900962,
"rewards/format_reward": 0.7366071730852127,
"rewards/tag_count_reward": 0.818526816368103,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 798.655386352539,
"epoch": 0.16557139199453785,
"grad_norm": 18.216583251953125,
"kl": 3.745068359375,
"learning_rate": 2.9608999958133147e-06,
"loss": 0.1496,
"reward": 1.7482143700122834,
"reward_std": 0.5732271403074265,
"rewards/accuracy_reward": 0.1580357201397419,
"rewards/format_reward": 0.7580357432365418,
"rewards/tag_count_reward": 0.832142898440361,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 796.342886352539,
"epoch": 0.16727831356149184,
"grad_norm": 3.8710556030273438,
"kl": 1.31533203125,
"learning_rate": 2.9588465049146673e-06,
"loss": -0.017,
"reward": 1.4564732909202576,
"reward_std": 0.7351120918989181,
"rewards/accuracy_reward": 0.1446428645402193,
"rewards/format_reward": 0.5937500238418579,
"rewards/tag_count_reward": 0.7180803924798965,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 745.3670013427734,
"epoch": 0.16898523512844585,
"grad_norm": 4.14769983291626,
"kl": 2.19775390625,
"learning_rate": 2.9567412107362925e-06,
"loss": 0.148,
"reward": 1.4696429371833801,
"reward_std": 0.7330913826823234,
"rewards/accuracy_reward": 0.1678571505472064,
"rewards/format_reward": 0.5955357432365418,
"rewards/tag_count_reward": 0.7062500327825546,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 821.0946746826172,
"epoch": 0.17069215669539983,
"grad_norm": 6.637725830078125,
"kl": 2.6984375,
"learning_rate": 2.954584188036668e-06,
"loss": 0.224,
"reward": 1.3792411386966705,
"reward_std": 0.7458429962396622,
"rewards/accuracy_reward": 0.14017857983708382,
"rewards/format_reward": 0.5544643133878708,
"rewards/tag_count_reward": 0.6845982402563096,
"step": 500
},
{
"clip_ratio": 0.0,
"completion_length": 879.7982543945312,
"epoch": 0.17239907826235384,
"grad_norm": 4.457671642303467,
"kl": 2.8482421875,
"learning_rate": 2.952375513411137e-06,
"loss": 0.2476,
"reward": 1.3660714864730834,
"reward_std": 0.7996741533279419,
"rewards/accuracy_reward": 0.12946429178118707,
"rewards/format_reward": 0.5562500268220901,
"rewards/tag_count_reward": 0.6803571701049804,
"step": 505
},
{
"clip_ratio": 0.0,
"completion_length": 940.1705780029297,
"epoch": 0.17410599982930786,
"grad_norm": 3.609261989593506,
"kl": 2.91796875,
"learning_rate": 2.9501152652891924e-06,
"loss": 0.3096,
"reward": 1.5997768640518188,
"reward_std": 0.7488022714853286,
"rewards/accuracy_reward": 0.15267857694998382,
"rewards/format_reward": 0.6794643104076385,
"rewards/tag_count_reward": 0.7676339626312256,
"step": 510
},
{
"clip_ratio": 0.0,
"completion_length": 921.5455718994141,
"epoch": 0.17581292139626184,
"grad_norm": 3.6567742824554443,
"kl": 3.191015625,
"learning_rate": 2.947803523931687e-06,
"loss": 0.3447,
"reward": 1.6316965103149415,
"reward_std": 0.6867892518639565,
"rewards/accuracy_reward": 0.15625000847503542,
"rewards/format_reward": 0.6946428820490838,
"rewards/tag_count_reward": 0.7808036059141159,
"step": 515
},
{
"clip_ratio": 0.0,
"completion_length": 931.1562896728516,
"epoch": 0.17751984296321585,
"grad_norm": 6.023605823516846,
"kl": 3.08828125,
"learning_rate": 2.945440371427987e-06,
"loss": 0.3204,
"reward": 1.5986607909202575,
"reward_std": 0.7305556893348694,
"rewards/accuracy_reward": 0.1598214365541935,
"rewards/format_reward": 0.6714285969734192,
"rewards/tag_count_reward": 0.767410746216774,
"step": 520
},
{
"clip_ratio": 0.0,
"completion_length": 870.6866394042969,
"epoch": 0.17922676453016984,
"grad_norm": 4.5348615646362305,
"kl": 2.55,
"learning_rate": 2.943025891693054e-06,
"loss": 0.2878,
"reward": 1.7037947177886963,
"reward_std": 0.6444522187113761,
"rewards/accuracy_reward": 0.17410715110599995,
"rewards/format_reward": 0.7169643193483353,
"rewards/tag_count_reward": 0.8127232521772385,
"step": 525
},
{
"clip_ratio": 0.0,
"completion_length": 843.8196807861328,
"epoch": 0.18093368609712385,
"grad_norm": 5.064777851104736,
"kl": 3.21171875,
"learning_rate": 2.940560170464469e-06,
"loss": 0.2754,
"reward": 1.6127232968807221,
"reward_std": 0.774041372537613,
"rewards/accuracy_reward": 0.144642864074558,
"rewards/format_reward": 0.6803571701049804,
"rewards/tag_count_reward": 0.7877232491970062,
"step": 530
},
{
"clip_ratio": 0.0,
"completion_length": 864.150033569336,
"epoch": 0.18264060766407783,
"grad_norm": 3.2473807334899902,
"kl": 1.7181640625,
"learning_rate": 2.938043295299385e-06,
"loss": 0.2154,
"reward": 1.6125000834465026,
"reward_std": 0.7306119620800018,
"rewards/accuracy_reward": 0.1553571529686451,
"rewards/format_reward": 0.6508928865194321,
"rewards/tag_count_reward": 0.8062500417232513,
"step": 535
},
{
"clip_ratio": 0.0,
"completion_length": 891.7946899414062,
"epoch": 0.18434752923103184,
"grad_norm": 3.4195573329925537,
"kl": 3.03828125,
"learning_rate": 2.9354753555714188e-06,
"loss": 0.3556,
"reward": 1.7002232849597931,
"reward_std": 0.6955976724624634,
"rewards/accuracy_reward": 0.14375000847503544,
"rewards/format_reward": 0.7267857462167739,
"rewards/tag_count_reward": 0.8296875327825546,
"step": 540
},
{
"clip_ratio": 0.0,
"completion_length": 863.1812896728516,
"epoch": 0.18605445079798583,
"grad_norm": 4.025784969329834,
"kl": 2.8484375,
"learning_rate": 2.932856442467476e-06,
"loss": 0.2801,
"reward": 1.723660796880722,
"reward_std": 0.6836878031492233,
"rewards/accuracy_reward": 0.19375001210719348,
"rewards/format_reward": 0.7125000387430191,
"rewards/tag_count_reward": 0.817410746216774,
"step": 545
},
{
"clip_ratio": 0.0,
"completion_length": 902.3437896728516,
"epoch": 0.18776137236493984,
"grad_norm": 7.916510105133057,
"kl": 3.9625,
"learning_rate": 2.9301866489845167e-06,
"loss": 0.3677,
"reward": 1.4658482789993286,
"reward_std": 0.7412176042795181,
"rewards/accuracy_reward": 0.16875000838190318,
"rewards/format_reward": 0.5830357387661934,
"rewards/tag_count_reward": 0.7140625357627869,
"step": 550
},
{
"clip_ratio": 0.0,
"completion_length": 867.4428955078125,
"epoch": 0.18946829393189382,
"grad_norm": 4.7268781661987305,
"kl": 2.042578125,
"learning_rate": 2.9274660699262483e-06,
"loss": 0.2883,
"reward": 1.5462054193019867,
"reward_std": 0.7172608077526093,
"rewards/accuracy_reward": 0.14375000577419997,
"rewards/format_reward": 0.6491071701049804,
"rewards/tag_count_reward": 0.7533482521772384,
"step": 555
},
{
"clip_ratio": 0.0,
"completion_length": 815.8750366210937,
"epoch": 0.19117521549884783,
"grad_norm": 9.568544387817383,
"kl": 2.749609375,
"learning_rate": 2.9246948018997622e-06,
"loss": 0.2904,
"reward": 1.508035770058632,
"reward_std": 0.6859041944146156,
"rewards/accuracy_reward": 0.1125000048428774,
"rewards/format_reward": 0.6437500223517418,
"rewards/tag_count_reward": 0.7517857521772384,
"step": 560
},
{
"clip_ratio": 0.0,
"completion_length": 751.5598541259766,
"epoch": 0.19288213706580182,
"grad_norm": 8.419025421142578,
"kl": 3.47421875,
"learning_rate": 2.9218729433121034e-06,
"loss": 0.2747,
"reward": 1.25133935213089,
"reward_std": 0.7746896028518677,
"rewards/accuracy_reward": 0.11785714812576771,
"rewards/format_reward": 0.48660716563463213,
"rewards/tag_count_reward": 0.6468750298023224,
"step": 565
},
{
"clip_ratio": 0.0,
"completion_length": 694.6839538574219,
"epoch": 0.19458905863275583,
"grad_norm": 6.536264896392822,
"kl": 2.1771484375,
"learning_rate": 2.9190005943667748e-06,
"loss": 0.2243,
"reward": 1.5011161386966705,
"reward_std": 0.7515750855207444,
"rewards/accuracy_reward": 0.15892858086153866,
"rewards/format_reward": 0.6035714507102966,
"rewards/tag_count_reward": 0.7386161029338837,
"step": 570
},
{
"clip_ratio": 0.0,
"completion_length": 712.3911041259765,
"epoch": 0.1962959801997098,
"grad_norm": 9.216137886047363,
"kl": 1.8626953125,
"learning_rate": 2.9160778570601787e-06,
"loss": 0.1986,
"reward": 1.6468750715255738,
"reward_std": 0.5837471626698971,
"rewards/accuracy_reward": 0.11517857760190964,
"rewards/format_reward": 0.7107143133878708,
"rewards/tag_count_reward": 0.8209821790456772,
"step": 575
},
{
"clip_ratio": 0.0,
"completion_length": 746.0116394042968,
"epoch": 0.19800290176666382,
"grad_norm": 3.9668009281158447,
"kl": 2.507421875,
"learning_rate": 2.9131048351779963e-06,
"loss": 0.2798,
"reward": 1.6100447118282317,
"reward_std": 0.6843759298324585,
"rewards/accuracy_reward": 0.15803572265431284,
"rewards/format_reward": 0.6633928894996644,
"rewards/tag_count_reward": 0.7886161029338836,
"step": 580
},
{
"clip_ratio": 0.0,
"completion_length": 708.175927734375,
"epoch": 0.1997098233336178,
"grad_norm": 2.981647491455078,
"kl": 2.158203125,
"learning_rate": 2.9100816342915025e-06,
"loss": 0.2073,
"reward": 1.5310268461704255,
"reward_std": 0.688689549267292,
"rewards/accuracy_reward": 0.16428572265431285,
"rewards/format_reward": 0.6160714536905288,
"rewards/tag_count_reward": 0.750669676065445,
"step": 585
},
{
"clip_ratio": 0.0,
"completion_length": 675.5616455078125,
"epoch": 0.20141674490057182,
"grad_norm": 2.74385142326355,
"kl": 2.028076171875,
"learning_rate": 2.907008361753815e-06,
"loss": 0.2021,
"reward": 1.7705357670783997,
"reward_std": 0.5894090965390205,
"rewards/accuracy_reward": 0.16964286444708704,
"rewards/format_reward": 0.7535714626312255,
"rewards/tag_count_reward": 0.8473214656114578,
"step": 590
},
{
"clip_ratio": 0.0,
"completion_length": 691.3527038574218,
"epoch": 0.2031236664675258,
"grad_norm": 13.477487564086914,
"kl": 1.8919921875,
"learning_rate": 2.903885126696083e-06,
"loss": 0.2684,
"reward": 1.987723308801651,
"reward_std": 0.3995923690497875,
"rewards/accuracy_reward": 0.17142858142033218,
"rewards/format_reward": 0.8812500417232514,
"rewards/tag_count_reward": 0.9350446790456772,
"step": 595
},
{
"clip_ratio": 0.0,
"completion_length": 785.2036071777344,
"epoch": 0.20483058803447982,
"grad_norm": 3.5577802658081055,
"kl": 2.7875,
"learning_rate": 2.900712040023615e-06,
"loss": 0.3984,
"reward": 1.8696429312229157,
"reward_std": 0.549374633282423,
"rewards/accuracy_reward": 0.15892857955768705,
"rewards/format_reward": 0.8214286148548127,
"rewards/tag_count_reward": 0.8892857551574707,
"step": 600
},
{
"clip_ratio": 0.0,
"completion_length": 760.4580657958984,
"epoch": 0.20653750960143383,
"grad_norm": 5.842397689819336,
"kl": 3.459765625,
"learning_rate": 2.8974892144119353e-06,
"loss": 0.3857,
"reward": 1.702455425262451,
"reward_std": 0.6209064692258834,
"rewards/accuracy_reward": 0.16160715045407414,
"rewards/format_reward": 0.7205357551574707,
"rewards/tag_count_reward": 0.8203125447034836,
"step": 605
},
{
"clip_ratio": 0.0,
"completion_length": 774.1259307861328,
"epoch": 0.2082444311683878,
"grad_norm": 4.560490131378174,
"kl": 2.733203125,
"learning_rate": 2.894216764302787e-06,
"loss": 0.3168,
"reward": 1.5366072058677673,
"reward_std": 0.7480582863092422,
"rewards/accuracy_reward": 0.14196429261937737,
"rewards/format_reward": 0.636607164144516,
"rewards/tag_count_reward": 0.7580357521772385,
"step": 610
},
{
"clip_ratio": 0.0,
"completion_length": 765.8830657958985,
"epoch": 0.20995135273534182,
"grad_norm": 3.172254800796509,
"kl": 2.722265625,
"learning_rate": 2.8908948059000676e-06,
"loss": 0.2966,
"reward": 1.5986607730388642,
"reward_std": 0.6495244219899178,
"rewards/accuracy_reward": 0.13571429261937737,
"rewards/format_reward": 0.6714285999536515,
"rewards/tag_count_reward": 0.7915178924798966,
"step": 615
},
{
"clip_ratio": 0.0,
"completion_length": 711.2607513427735,
"epoch": 0.2116582743022958,
"grad_norm": 3.3680763244628906,
"kl": 3.16171875,
"learning_rate": 2.8875234571656997e-06,
"loss": 0.3196,
"reward": 1.6500000834465027,
"reward_std": 0.7083895608782769,
"rewards/accuracy_reward": 0.15000000903382898,
"rewards/format_reward": 0.6964285969734192,
"rewards/tag_count_reward": 0.8035714656114579,
"step": 620
},
{
"clip_ratio": 0.0,
"completion_length": 765.7464630126954,
"epoch": 0.21336519586924982,
"grad_norm": 7.421957492828369,
"kl": 3.5240234375,
"learning_rate": 2.8841028378154463e-06,
"loss": 0.3704,
"reward": 1.6843750715255736,
"reward_std": 0.7275039911270141,
"rewards/accuracy_reward": 0.16696429308503866,
"rewards/format_reward": 0.710714316368103,
"rewards/tag_count_reward": 0.8066964626312256,
"step": 625
},
{
"clip_ratio": 0.0,
"completion_length": 751.6946838378906,
"epoch": 0.2150721174362038,
"grad_norm": 5.945168972015381,
"kl": 2.863671875,
"learning_rate": 2.8806330693146575e-06,
"loss": 0.3454,
"reward": 1.7339286506175995,
"reward_std": 0.6255421549081802,
"rewards/accuracy_reward": 0.1321428634226322,
"rewards/format_reward": 0.7598214596509933,
"rewards/tag_count_reward": 0.8419643193483353,
"step": 630
},
{
"clip_ratio": 0.0,
"completion_length": 769.4634246826172,
"epoch": 0.21677903900315781,
"grad_norm": 2.3628225326538086,
"kl": 3.294921875,
"learning_rate": 2.877114274873957e-06,
"loss": 0.3297,
"reward": 1.7406250834465027,
"reward_std": 0.6107124865055085,
"rewards/accuracy_reward": 0.14464286426082254,
"rewards/format_reward": 0.764285746216774,
"rewards/tag_count_reward": 0.8316964715719223,
"step": 635
},
{
"clip_ratio": 0.0,
"completion_length": 806.5482543945312,
"epoch": 0.2184859605701118,
"grad_norm": 5.536141395568848,
"kl": 2.88671875,
"learning_rate": 2.8735465794448674e-06,
"loss": 0.3702,
"reward": 1.673883992433548,
"reward_std": 0.6438215777277947,
"rewards/accuracy_reward": 0.12857143385335804,
"rewards/format_reward": 0.7267857402563095,
"rewards/tag_count_reward": 0.8185268223285675,
"step": 640
},
{
"clip_ratio": 0.0,
"completion_length": 863.1018280029297,
"epoch": 0.2201928821370658,
"grad_norm": 7.033008575439453,
"kl": 3.584375,
"learning_rate": 2.869930109715375e-06,
"loss": 0.4444,
"reward": 1.6212054193019867,
"reward_std": 0.6893970921635628,
"rewards/accuracy_reward": 0.13660714784637093,
"rewards/format_reward": 0.6901785939931869,
"rewards/tag_count_reward": 0.794419676065445,
"step": 645
},
{
"clip_ratio": 0.0,
"completion_length": 786.244677734375,
"epoch": 0.2218998037040198,
"grad_norm": 4.502243518829346,
"kl": 2.404296875,
"learning_rate": 2.8662649941054266e-06,
"loss": 0.2794,
"reward": 1.6738840103149415,
"reward_std": 0.6413164183497428,
"rewards/accuracy_reward": 0.12232143357396126,
"rewards/format_reward": 0.7303571730852128,
"rewards/tag_count_reward": 0.8212053954601288,
"step": 650
},
{
"clip_ratio": 0.0,
"completion_length": 787.2545013427734,
"epoch": 0.2236067252709738,
"grad_norm": 6.634191989898682,
"kl": 3.10234375,
"learning_rate": 2.8625513627623757e-06,
"loss": 0.3178,
"reward": 1.6941964983940125,
"reward_std": 0.6722912862896919,
"rewards/accuracy_reward": 0.155357148591429,
"rewards/format_reward": 0.7267857551574707,
"rewards/tag_count_reward": 0.8120536059141159,
"step": 655
},
{
"clip_ratio": 0.0,
"completion_length": 740.8027191162109,
"epoch": 0.2253136468379278,
"grad_norm": 5.780096530914307,
"kl": 2.25556640625,
"learning_rate": 2.8587893475563546e-06,
"loss": 0.2618,
"reward": 1.9029018759727478,
"reward_std": 0.548953752219677,
"rewards/accuracy_reward": 0.17589286472648383,
"rewards/format_reward": 0.8410714715719223,
"rewards/tag_count_reward": 0.8859375387430191,
"step": 660
},
{
"clip_ratio": 0.0,
"completion_length": 804.132177734375,
"epoch": 0.2270205684048818,
"grad_norm": 5.417214870452881,
"kl": 2.9451171875,
"learning_rate": 2.854979082075596e-06,
"loss": 0.3144,
"reward": 1.9008929550647735,
"reward_std": 0.5650276392698288,
"rewards/accuracy_reward": 0.2026785809546709,
"rewards/format_reward": 0.8223214656114578,
"rewards/tag_count_reward": 0.8758929014205933,
"step": 665
},
{
"clip_ratio": 0.0,
"completion_length": 783.5616394042969,
"epoch": 0.22872748997183578,
"grad_norm": 3.13038969039917,
"kl": 3.588671875,
"learning_rate": 2.851120701621688e-06,
"loss": 0.3702,
"reward": 1.8468750834465026,
"reward_std": 0.5954644531011581,
"rewards/accuracy_reward": 0.19642858095467092,
"rewards/format_reward": 0.7919643223285675,
"rewards/tag_count_reward": 0.8584821790456771,
"step": 670
},
{
"clip_ratio": 0.0,
"completion_length": 778.3678833007813,
"epoch": 0.2304344115387898,
"grad_norm": 5.504825592041016,
"kl": 2.851171875,
"learning_rate": 2.8472143432047694e-06,
"loss": 0.3215,
"reward": 1.8622768580913545,
"reward_std": 0.5927805215120315,
"rewards/accuracy_reward": 0.19732143823057413,
"rewards/format_reward": 0.8017857521772385,
"rewards/tag_count_reward": 0.8631696820259094,
"step": 675
},
{
"clip_ratio": 0.0,
"completion_length": 827.1464691162109,
"epoch": 0.23214133310574378,
"grad_norm": 4.780069351196289,
"kl": 3.583984375,
"learning_rate": 2.8432601455386644e-06,
"loss": 0.3892,
"reward": 1.737723284959793,
"reward_std": 0.6600617378950119,
"rewards/accuracy_reward": 0.17500000931322574,
"rewards/format_reward": 0.7455357432365417,
"rewards/tag_count_reward": 0.8171875357627869,
"step": 680
},
{
"clip_ratio": 0.0,
"completion_length": 792.6098571777344,
"epoch": 0.2338482546726978,
"grad_norm": 3.67447829246521,
"kl": 3.701953125,
"learning_rate": 2.8392582490359563e-06,
"loss": 0.3747,
"reward": 1.7160715103149413,
"reward_std": 0.6576286390423774,
"rewards/accuracy_reward": 0.16964286621659994,
"rewards/format_reward": 0.7348214566707612,
"rewards/tag_count_reward": 0.811607176065445,
"step": 685
},
{
"clip_ratio": 0.0,
"completion_length": 756.3125366210937,
"epoch": 0.23555517623965178,
"grad_norm": 7.4521660804748535,
"kl": 2.940234375,
"learning_rate": 2.8352087958030044e-06,
"loss": 0.2788,
"reward": 1.7551340162754059,
"reward_std": 0.6580756172537804,
"rewards/accuracy_reward": 0.16428572107106448,
"rewards/format_reward": 0.7553571790456772,
"rewards/tag_count_reward": 0.8354911088943482,
"step": 690
},
{
"clip_ratio": 0.0,
"completion_length": 743.0946716308594,
"epoch": 0.2372620978066058,
"grad_norm": 3.591153144836426,
"kl": 3.9421875,
"learning_rate": 2.8311119296348947e-06,
"loss": 0.3788,
"reward": 1.6745536565780639,
"reward_std": 0.714211243391037,
"rewards/accuracy_reward": 0.19375000949949026,
"rewards/format_reward": 0.6946428894996644,
"rewards/tag_count_reward": 0.7861607521772385,
"step": 695
},
{
"clip_ratio": 0.0,
"completion_length": 693.334848022461,
"epoch": 0.23896901937355977,
"grad_norm": 17.02330207824707,
"kl": 2.3927734375,
"learning_rate": 2.826967796010334e-06,
"loss": 0.2871,
"reward": 1.8256697237491608,
"reward_std": 0.600771751999855,
"rewards/accuracy_reward": 0.15803572218865156,
"rewards/format_reward": 0.8008928954601288,
"rewards/tag_count_reward": 0.8667411148548126,
"step": 700
},
{
"clip_ratio": 0.0,
"completion_length": 669.8428833007813,
"epoch": 0.24067594094051378,
"grad_norm": 3.661524534225464,
"kl": 3.7298828125,
"learning_rate": 2.8227765420864864e-06,
"loss": 0.3348,
"reward": 1.8287947297096252,
"reward_std": 0.6175472036004066,
"rewards/accuracy_reward": 0.21517857927829026,
"rewards/format_reward": 0.7669643223285675,
"rewards/tag_count_reward": 0.846651828289032,
"step": 705
},
{
"clip_ratio": 0.0,
"completion_length": 668.5580657958984,
"epoch": 0.2423828625074678,
"grad_norm": 3.7521235942840576,
"kl": 2.2900390625,
"learning_rate": 2.8185383166937453e-06,
"loss": 0.2425,
"reward": 1.7064732909202576,
"reward_std": 0.6850044190883636,
"rewards/accuracy_reward": 0.16071429289877415,
"rewards/format_reward": 0.7241071730852127,
"rewards/tag_count_reward": 0.8216518193483353,
"step": 710
},
{
"clip_ratio": 0.0,
"completion_length": 669.4509185791015,
"epoch": 0.24408978407442178,
"grad_norm": 5.815065860748291,
"kl": 3.68515625,
"learning_rate": 2.8142532703304487e-06,
"loss": 0.2798,
"reward": 1.6671875596046448,
"reward_std": 0.6780211150646209,
"rewards/accuracy_reward": 0.15535714998841285,
"rewards/format_reward": 0.6991071730852128,
"rewards/tag_count_reward": 0.8127232491970062,
"step": 715
},
{
"clip_ratio": 0.0,
"completion_length": 716.5839660644531,
"epoch": 0.2457967056413758,
"grad_norm": 2.345759153366089,
"kl": 2.4634765625,
"learning_rate": 2.8099215551575375e-06,
"loss": 0.2412,
"reward": 1.6359375834465026,
"reward_std": 0.695411990582943,
"rewards/accuracy_reward": 0.17767857862636446,
"rewards/format_reward": 0.6669643133878708,
"rewards/tag_count_reward": 0.7912946760654449,
"step": 720
},
{
"clip_ratio": 0.0,
"completion_length": 697.8205673217774,
"epoch": 0.24750362720832977,
"grad_norm": 4.898887634277344,
"kl": 2.38125,
"learning_rate": 2.805543324993149e-06,
"loss": 0.2515,
"reward": 1.6654018700122832,
"reward_std": 0.6400819554924965,
"rewards/accuracy_reward": 0.1598214372061193,
"rewards/format_reward": 0.6964285969734192,
"rewards/tag_count_reward": 0.8091518193483352,
"step": 725
},
{
"clip_ratio": 0.0,
"completion_length": 711.1000305175781,
"epoch": 0.24921054877528379,
"grad_norm": 2.1958072185516357,
"kl": 2.2142578125,
"learning_rate": 2.8011187353071575e-06,
"loss": 0.2594,
"reward": 1.758035796880722,
"reward_std": 0.6030809044837951,
"rewards/accuracy_reward": 0.12589286137372255,
"rewards/format_reward": 0.77232146859169,
"rewards/tag_count_reward": 0.8598214685916901,
"step": 730
},
{
"clip_ratio": 0.0,
"completion_length": 717.7393188476562,
"epoch": 0.25091747034223777,
"grad_norm": 3.359809637069702,
"kl": 2.7140625,
"learning_rate": 2.796647943215651e-06,
"loss": 0.2854,
"reward": 1.7325893461704254,
"reward_std": 0.6349606230854988,
"rewards/accuracy_reward": 0.16785715138539672,
"rewards/format_reward": 0.7375000327825546,
"rewards/tag_count_reward": 0.8272321820259094,
"step": 735
},
{
"clip_ratio": 0.0,
"completion_length": 721.5625427246093,
"epoch": 0.2526243919091918,
"grad_norm": 3.7433454990386963,
"kl": 3.691796875,
"learning_rate": 2.792131107475355e-06,
"loss": 0.3333,
"reward": 1.5823661267757416,
"reward_std": 0.7140142098069191,
"rewards/accuracy_reward": 0.1642857214435935,
"rewards/format_reward": 0.6526785969734192,
"rewards/tag_count_reward": 0.7654018223285675,
"step": 740
},
{
"clip_ratio": 0.0,
"completion_length": 744.0803863525391,
"epoch": 0.2543313134761458,
"grad_norm": 3.059191942214966,
"kl": 2.079296875,
"learning_rate": 2.7875683884779937e-06,
"loss": 0.2113,
"reward": 1.699330449104309,
"reward_std": 0.6429368361830712,
"rewards/accuracy_reward": 0.1500000087544322,
"rewards/format_reward": 0.7285714596509933,
"rewards/tag_count_reward": 0.8207589685916901,
"step": 745
},
{
"clip_ratio": 0.0,
"completion_length": 677.4527130126953,
"epoch": 0.25603823504309975,
"grad_norm": 4.240560054779053,
"kl": 1.996240234375,
"learning_rate": 2.782959948244593e-06,
"loss": 0.1608,
"reward": 1.8892858147621154,
"reward_std": 0.4875759735703468,
"rewards/accuracy_reward": 0.13214286137372255,
"rewards/format_reward": 0.8526786118745804,
"rewards/tag_count_reward": 0.9044643312692642,
"step": 750
},
{
"clip_ratio": 0.0,
"completion_length": 682.899136352539,
"epoch": 0.25774515661005376,
"grad_norm": 4.286968231201172,
"kl": 3.210546875,
"learning_rate": 2.7783059504197293e-06,
"loss": 0.3291,
"reward": 1.8241072475910187,
"reward_std": 0.6449521824717521,
"rewards/accuracy_reward": 0.1866071516647935,
"rewards/format_reward": 0.7794643253087997,
"rewards/tag_count_reward": 0.8580357432365417,
"step": 755
},
{
"clip_ratio": 0.0,
"completion_length": 718.2161071777343,
"epoch": 0.25945207817700777,
"grad_norm": 5.481039047241211,
"kl": 2.884765625,
"learning_rate": 2.7736065602657186e-06,
"loss": 0.3314,
"reward": 1.7383929252624513,
"reward_std": 0.6634438171982765,
"rewards/accuracy_reward": 0.15089286332949997,
"rewards/format_reward": 0.7526785999536514,
"rewards/tag_count_reward": 0.83482146859169,
"step": 760
},
{
"clip_ratio": 0.0,
"completion_length": 741.3330688476562,
"epoch": 0.2611589997439618,
"grad_norm": 5.945084095001221,
"kl": 3.43203125,
"learning_rate": 2.7688619446567456e-06,
"loss": 0.4039,
"reward": 1.7281250834465027,
"reward_std": 0.6317564234137535,
"rewards/accuracy_reward": 0.11250000493600965,
"rewards/format_reward": 0.7660714745521545,
"rewards/tag_count_reward": 0.8495536029338837,
"step": 765
},
{
"clip_ratio": 0.0,
"completion_length": 730.2044952392578,
"epoch": 0.26286592131091574,
"grad_norm": 9.397069931030273,
"kl": 3.7162109375,
"learning_rate": 2.7640722720729424e-06,
"loss": 0.3945,
"reward": 1.7779018819332122,
"reward_std": 0.6457269221544266,
"rewards/accuracy_reward": 0.15714286686852574,
"rewards/format_reward": 0.7714286088943482,
"rewards/tag_count_reward": 0.8493303894996643,
"step": 770
},
{
"clip_ratio": 0.0,
"completion_length": 686.750032043457,
"epoch": 0.26457284287786975,
"grad_norm": 3.8738150596618652,
"kl": 2.05673828125,
"learning_rate": 2.7592377125944e-06,
"loss": 0.2139,
"reward": 1.8906250834465026,
"reward_std": 0.49385173320770265,
"rewards/accuracy_reward": 0.14910714831203223,
"rewards/format_reward": 0.8428571850061417,
"rewards/tag_count_reward": 0.8986607521772385,
"step": 775
},
{
"clip_ratio": 0.0,
"completion_length": 708.9919921875,
"epoch": 0.26627976444482376,
"grad_norm": 5.740293502807617,
"kl": 2.2453125,
"learning_rate": 2.7543584378951353e-06,
"loss": 0.3081,
"reward": 1.9504465162754059,
"reward_std": 0.4741246700286865,
"rewards/accuracy_reward": 0.1589285794645548,
"rewards/format_reward": 0.8758929014205933,
"rewards/tag_count_reward": 0.9156250447034836,
"step": 780
},
{
"clip_ratio": 0.0,
"completion_length": 744.438427734375,
"epoch": 0.2679866860117778,
"grad_norm": 5.1604743003845215,
"kl": 3.141796875,
"learning_rate": 2.7494346212369884e-06,
"loss": 0.3417,
"reward": 1.8397322237491607,
"reward_std": 0.5357820302248001,
"rewards/accuracy_reward": 0.1508928632363677,
"rewards/format_reward": 0.8133928954601288,
"rewards/tag_count_reward": 0.8754464715719223,
"step": 785
},
{
"clip_ratio": 0.0,
"completion_length": 760.9339630126954,
"epoch": 0.26969360757873173,
"grad_norm": 2.745206832885742,
"kl": 3.4203125,
"learning_rate": 2.7444664374634755e-06,
"loss": 0.3742,
"reward": 1.7848215222358703,
"reward_std": 0.6119966760277749,
"rewards/accuracy_reward": 0.12321429047733545,
"rewards/format_reward": 0.8026786148548126,
"rewards/tag_count_reward": 0.8589286118745804,
"step": 790
},
{
"clip_ratio": 0.0,
"completion_length": 745.5946807861328,
"epoch": 0.27140052914568574,
"grad_norm": 5.523811340332031,
"kl": 3.22548828125,
"learning_rate": 2.739454062993578e-06,
"loss": 0.3161,
"reward": 1.862946516275406,
"reward_std": 0.5260402396321296,
"rewards/accuracy_reward": 0.16607143506407737,
"rewards/format_reward": 0.823214328289032,
"rewards/tag_count_reward": 0.8736607551574707,
"step": 795
},
{
"clip_ratio": 0.0,
"completion_length": 748.8053894042969,
"epoch": 0.27310745071263975,
"grad_norm": 4.337275981903076,
"kl": 1.62451171875,
"learning_rate": 2.7343976758154765e-06,
"loss": 0.2197,
"reward": 1.9397322237491608,
"reward_std": 0.47983556240797043,
"rewards/accuracy_reward": 0.169642863702029,
"rewards/format_reward": 0.86607146859169,
"rewards/tag_count_reward": 0.9040178984403611,
"step": 800
},
{
"clip_ratio": 0.0,
"completion_length": 729.189321899414,
"epoch": 0.27481437227959377,
"grad_norm": 6.721214294433594,
"kl": 3.484375,
"learning_rate": 2.7292974554802343e-06,
"loss": 0.3518,
"reward": 1.8767857909202577,
"reward_std": 0.5359082013368607,
"rewards/accuracy_reward": 0.16875000894069672,
"rewards/format_reward": 0.8303571820259095,
"rewards/tag_count_reward": 0.8776786088943481,
"step": 805
},
{
"clip_ratio": 0.0,
"completion_length": 764.805386352539,
"epoch": 0.2765212938465478,
"grad_norm": 4.701261520385742,
"kl": 2.3328125,
"learning_rate": 2.7241535830954174e-06,
"loss": 0.2629,
"reward": 1.816517949104309,
"reward_std": 0.500702029466629,
"rewards/accuracy_reward": 0.1241071479395032,
"rewards/format_reward": 0.8241071820259094,
"rewards/tag_count_reward": 0.8683036148548127,
"step": 810
},
{
"clip_ratio": 0.0,
"completion_length": 714.7661026000976,
"epoch": 0.27822821541350173,
"grad_norm": 13.666290283203125,
"kl": 2.6896484375,
"learning_rate": 2.718966241318666e-06,
"loss": 0.3039,
"reward": 1.8928572177886962,
"reward_std": 0.551244530826807,
"rewards/accuracy_reward": 0.17857143664732575,
"rewards/format_reward": 0.835714328289032,
"rewards/tag_count_reward": 0.8785714715719223,
"step": 815
},
{
"clip_ratio": 0.0,
"completion_length": 743.2277130126953,
"epoch": 0.27993513698045575,
"grad_norm": 8.007411003112793,
"kl": 2.865234375,
"learning_rate": 2.713735614351208e-06,
"loss": 0.2725,
"reward": 1.8796875894069671,
"reward_std": 0.5558550491929054,
"rewards/accuracy_reward": 0.17857143580913543,
"rewards/format_reward": 0.8267857521772385,
"rewards/tag_count_reward": 0.874330398440361,
"step": 820
},
{
"clip_ratio": 0.0,
"completion_length": 822.1491485595703,
"epoch": 0.28164205854740976,
"grad_norm": 5.34137487411499,
"kl": 3.694140625,
"learning_rate": 2.7084618879313177e-06,
"loss": 0.4349,
"reward": 1.6993304371833802,
"reward_std": 0.7172507822513581,
"rewards/accuracy_reward": 0.12500000596046448,
"rewards/format_reward": 0.7419643193483353,
"rewards/tag_count_reward": 0.8323661059141159,
"step": 825
},
{
"clip_ratio": 0.0,
"completion_length": 812.3500396728516,
"epoch": 0.28334898011436377,
"grad_norm": 2.870173454284668,
"kl": 3.059375,
"learning_rate": 2.7031452493277193e-06,
"loss": 0.3536,
"reward": 1.5551339983940125,
"reward_std": 0.7007823586463928,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/format_reward": 0.6125000178813934,
"rewards/tag_count_reward": 0.8176339656114578,
"step": 830
},
{
"clip_ratio": 0.0,
"completion_length": 774.5098571777344,
"epoch": 0.2850559016813177,
"grad_norm": 3.5636160373687744,
"kl": 2.5625,
"learning_rate": 2.6977858873329394e-06,
"loss": 0.2921,
"reward": 1.5524554193019866,
"reward_std": 0.6861270070075989,
"rewards/accuracy_reward": 0.16696429420262576,
"rewards/format_reward": 0.5660714566707611,
"rewards/tag_count_reward": 0.819419676065445,
"step": 835
},
{
"clip_ratio": 0.0,
"completion_length": 755.8803894042969,
"epoch": 0.28676282324827174,
"grad_norm": 7.776058197021484,
"kl": 2.5333984375,
"learning_rate": 2.6923839922566012e-06,
"loss": 0.3192,
"reward": 1.6341518700122832,
"reward_std": 0.6832039266824722,
"rewards/accuracy_reward": 0.17410714970901608,
"rewards/format_reward": 0.6196428865194321,
"rewards/tag_count_reward": 0.8404018163681031,
"step": 840
},
{
"clip_ratio": 0.0,
"completion_length": 714.9723510742188,
"epoch": 0.28846974481522575,
"grad_norm": 5.4556498527526855,
"kl": 3.023828125,
"learning_rate": 2.686939755918667e-06,
"loss": 0.3092,
"reward": 1.707589364051819,
"reward_std": 0.6695069923996926,
"rewards/accuracy_reward": 0.1642857222817838,
"rewards/format_reward": 0.700892886519432,
"rewards/tag_count_reward": 0.842410746216774,
"step": 845
},
{
"clip_ratio": 0.0,
"completion_length": 713.3223480224609,
"epoch": 0.29017666638217976,
"grad_norm": 4.508235454559326,
"kl": 3.01953125,
"learning_rate": 2.6814533716426266e-06,
"loss": 0.3284,
"reward": 1.711160808801651,
"reward_std": 0.6680841892957687,
"rewards/accuracy_reward": 0.14910715091973542,
"rewards/format_reward": 0.7205357491970062,
"rewards/tag_count_reward": 0.8415178894996643,
"step": 850
},
{
"clip_ratio": 0.0,
"completion_length": 693.2723495483399,
"epoch": 0.2918835879491337,
"grad_norm": 5.78735876083374,
"kl": 2.384765625,
"learning_rate": 2.675925034248633e-06,
"loss": 0.2759,
"reward": 1.7803572475910188,
"reward_std": 0.5899964898824692,
"rewards/accuracy_reward": 0.11339286342263222,
"rewards/format_reward": 0.7901786118745804,
"rewards/tag_count_reward": 0.8767857521772384,
"step": 855
},
{
"clip_ratio": 0.0,
"completion_length": 703.7946746826171,
"epoch": 0.2935905095160877,
"grad_norm": 10.656431198120117,
"kl": 2.68349609375,
"learning_rate": 2.670354940046585e-06,
"loss": 0.2471,
"reward": 1.8513393819332122,
"reward_std": 0.5069714426994324,
"rewards/accuracy_reward": 0.15625000707805156,
"rewards/format_reward": 0.8116071820259094,
"rewards/tag_count_reward": 0.8834821820259094,
"step": 860
},
{
"clip_ratio": 0.0,
"completion_length": 701.3857421875,
"epoch": 0.29529743108304174,
"grad_norm": 5.979944229125977,
"kl": 1.7492431640625,
"learning_rate": 2.664743286829154e-06,
"loss": 0.1888,
"reward": 1.957812601327896,
"reward_std": 0.4485042683780193,
"rewards/accuracy_reward": 0.16785714952275158,
"rewards/format_reward": 0.8714286148548126,
"rewards/tag_count_reward": 0.918526828289032,
"step": 865
},
{
"clip_ratio": 0.0,
"completion_length": 720.1571807861328,
"epoch": 0.29700435264999575,
"grad_norm": 11.468668937683105,
"kl": 2.44384765625,
"learning_rate": 2.6590902738647616e-06,
"loss": 0.2573,
"reward": 1.9354911506175996,
"reward_std": 0.4262035805732012,
"rewards/accuracy_reward": 0.14375000605359672,
"rewards/format_reward": 0.8741071820259094,
"rewards/tag_count_reward": 0.9176339715719223,
"step": 870
},
{
"clip_ratio": 0.0,
"completion_length": 751.6589630126953,
"epoch": 0.2987112742169497,
"grad_norm": 2.175903081893921,
"kl": 1.407275390625,
"learning_rate": 2.6533961018905052e-06,
"loss": 0.1315,
"reward": 1.9747768819332123,
"reward_std": 0.41898268088698387,
"rewards/accuracy_reward": 0.17946429271250963,
"rewards/format_reward": 0.8776786088943481,
"rewards/tag_count_reward": 0.9176339745521546,
"step": 875
},
{
"clip_ratio": 0.0,
"completion_length": 717.8036071777344,
"epoch": 0.3004181957839037,
"grad_norm": 14.61109733581543,
"kl": 1.8591796875,
"learning_rate": 2.6476609731050277e-06,
"loss": 0.18,
"reward": 1.9504465281963348,
"reward_std": 0.44176030084490775,
"rewards/accuracy_reward": 0.17410715091973544,
"rewards/format_reward": 0.8669643223285675,
"rewards/tag_count_reward": 0.9093750417232513,
"step": 880
},
{
"clip_ratio": 0.0,
"completion_length": 720.5634307861328,
"epoch": 0.30212511735085773,
"grad_norm": 9.741268157958984,
"kl": 2.7447265625,
"learning_rate": 2.6418850911613385e-06,
"loss": 0.1958,
"reward": 1.9113840103149413,
"reward_std": 0.5450401276350021,
"rewards/accuracy_reward": 0.19375000856816768,
"rewards/format_reward": 0.8312500447034836,
"rewards/tag_count_reward": 0.8863839685916901,
"step": 885
},
{
"clip_ratio": 0.0,
"completion_length": 695.589321899414,
"epoch": 0.30383203891781174,
"grad_norm": 3.411552667617798,
"kl": 1.9880859375,
"learning_rate": 2.6360686611595808e-06,
"loss": 0.1606,
"reward": 1.933928668498993,
"reward_std": 0.5159840732812881,
"rewards/accuracy_reward": 0.18303572619333863,
"rewards/format_reward": 0.8535714715719223,
"rewards/tag_count_reward": 0.8973214626312256,
"step": 890
},
{
"clip_ratio": 0.0,
"completion_length": 700.4536041259765,
"epoch": 0.3055389604847657,
"grad_norm": 2.229177474975586,
"kl": 1.51279296875,
"learning_rate": 2.63021188963975e-06,
"loss": 0.1022,
"reward": 1.9627233028411866,
"reward_std": 0.5419594079256058,
"rewards/accuracy_reward": 0.21071429420262575,
"rewards/format_reward": 0.8491071790456772,
"rewards/tag_count_reward": 0.902901828289032,
"step": 895
},
{
"clip_ratio": 0.0,
"completion_length": 720.6919952392578,
"epoch": 0.3072458820517197,
"grad_norm": 5.474370956420898,
"kl": 2.558203125,
"learning_rate": 2.62431498457436e-06,
"loss": 0.2085,
"reward": 1.7459822177886963,
"reward_std": 0.5931232050061226,
"rewards/accuracy_reward": 0.11696429029107094,
"rewards/format_reward": 0.7803571790456771,
"rewards/tag_count_reward": 0.848660746216774,
"step": 900
},
{
"clip_ratio": 0.0,
"completion_length": 731.7214614868165,
"epoch": 0.3089528036186737,
"grad_norm": 3.358218193054199,
"kl": 1.9822265625,
"learning_rate": 2.6183781553610553e-06,
"loss": 0.1622,
"reward": 1.9395090162754058,
"reward_std": 0.45833816528320315,
"rewards/accuracy_reward": 0.1821428650058806,
"rewards/format_reward": 0.8562500417232514,
"rewards/tag_count_reward": 0.9011161118745804,
"step": 905
},
{
"clip_ratio": 0.0,
"completion_length": 698.5411071777344,
"epoch": 0.31065972518562773,
"grad_norm": 3.1719696521759033,
"kl": 1.41435546875,
"learning_rate": 2.612401612815176e-06,
"loss": 0.1284,
"reward": 2.0187501192092894,
"reward_std": 0.4606904126703739,
"rewards/accuracy_reward": 0.20892858002334833,
"rewards/format_reward": 0.8901786148548126,
"rewards/tag_count_reward": 0.9196429014205932,
"step": 910
},
{
"clip_ratio": 0.0,
"completion_length": 702.0857452392578,
"epoch": 0.31236664675258174,
"grad_norm": 4.088690757751465,
"kl": 2.1846923828125,
"learning_rate": 2.6063855691622773e-06,
"loss": 0.2211,
"reward": 1.9000000834465027,
"reward_std": 0.4846745885908604,
"rewards/accuracy_reward": 0.134821433480829,
"rewards/format_reward": 0.8642857521772385,
"rewards/tag_count_reward": 0.900892898440361,
"step": 915
},
{
"clip_ratio": 0.0,
"completion_length": 721.0803833007812,
"epoch": 0.3140735683195357,
"grad_norm": 4.751431465148926,
"kl": 2.767578125,
"learning_rate": 2.6003302380305835e-06,
"loss": 0.2363,
"reward": 1.8395090222358703,
"reward_std": 0.6193484604358673,
"rewards/accuracy_reward": 0.18660715268924832,
"rewards/format_reward": 0.7964286088943482,
"rewards/tag_count_reward": 0.8564732551574707,
"step": 920
},
{
"clip_ratio": 0.0,
"completion_length": 763.2937896728515,
"epoch": 0.3157804898864897,
"grad_norm": 13.211581230163574,
"kl": 2.8927734375,
"learning_rate": 2.5942358344434123e-06,
"loss": 0.2895,
"reward": 1.7747768580913543,
"reward_std": 0.6452827632427216,
"rewards/accuracy_reward": 0.16428572237491607,
"rewards/format_reward": 0.7723214626312256,
"rewards/tag_count_reward": 0.8381696790456772,
"step": 925
},
{
"clip_ratio": 0.0,
"completion_length": 788.6928924560547,
"epoch": 0.3174874114534437,
"grad_norm": 10.8933687210083,
"kl": 3.546875,
"learning_rate": 2.588102574811531e-06,
"loss": 0.3418,
"reward": 1.7863839983940124,
"reward_std": 0.6044072821736336,
"rewards/accuracy_reward": 0.1562500067986548,
"rewards/format_reward": 0.7821428894996643,
"rewards/tag_count_reward": 0.8479911088943481,
"step": 930
},
{
"clip_ratio": 0.0,
"completion_length": 764.8339691162109,
"epoch": 0.31919433302039774,
"grad_norm": 6.15324068069458,
"kl": 3.3,
"learning_rate": 2.581930676925478e-06,
"loss": 0.3588,
"reward": 1.8930804491043092,
"reward_std": 0.5663298577070236,
"rewards/accuracy_reward": 0.20714286640286445,
"rewards/format_reward": 0.8142857611179352,
"rewards/tag_count_reward": 0.8716518223285675,
"step": 935
},
{
"clip_ratio": 0.0,
"completion_length": 747.1009246826172,
"epoch": 0.3209012545873517,
"grad_norm": 6.0142927169799805,
"kl": 1.954833984375,
"learning_rate": 2.5757203599478252e-06,
"loss": 0.2146,
"reward": 1.9209822177886964,
"reward_std": 0.4752822183072567,
"rewards/accuracy_reward": 0.1660714359022677,
"rewards/format_reward": 0.8562500447034835,
"rewards/tag_count_reward": 0.8986607551574707,
"step": 940
},
{
"clip_ratio": 0.0,
"completion_length": 794.6161071777344,
"epoch": 0.3226081761543057,
"grad_norm": 4.774122714996338,
"kl": 2.436328125,
"learning_rate": 2.5694718444053977e-06,
"loss": 0.281,
"reward": 1.9752233147621154,
"reward_std": 0.4990184798836708,
"rewards/accuracy_reward": 0.18660715082660317,
"rewards/format_reward": 0.873214328289032,
"rewards/tag_count_reward": 0.9154018312692642,
"step": 945
},
{
"clip_ratio": 0.0,
"completion_length": 798.3562744140625,
"epoch": 0.3243150977212597,
"grad_norm": 8.109549522399902,
"kl": 2.608984375,
"learning_rate": 2.5631853521814413e-06,
"loss": 0.3287,
"reward": 1.9203125774860381,
"reward_std": 0.5808916047215462,
"rewards/accuracy_reward": 0.2053571494296193,
"rewards/format_reward": 0.8321428924798966,
"rewards/tag_count_reward": 0.8828125417232513,
"step": 950
},
{
"clip_ratio": 0.0,
"completion_length": 786.5107513427735,
"epoch": 0.3260220192882137,
"grad_norm": 10.114983558654785,
"kl": 3.005859375,
"learning_rate": 2.556861106507745e-06,
"loss": 0.3483,
"reward": 1.8308036565780639,
"reward_std": 0.598972900211811,
"rewards/accuracy_reward": 0.16696429401636123,
"rewards/format_reward": 0.8000000387430191,
"rewards/tag_count_reward": 0.8638393193483352,
"step": 955
},
{
"clip_ratio": 0.0,
"completion_length": 804.1580688476563,
"epoch": 0.3277289408551677,
"grad_norm": 6.412922382354736,
"kl": 3.478125,
"learning_rate": 2.5504993319567154e-06,
"loss": 0.3655,
"reward": 1.837276864051819,
"reward_std": 0.5885704472661019,
"rewards/accuracy_reward": 0.1660714378580451,
"rewards/format_reward": 0.8062500447034836,
"rewards/tag_count_reward": 0.8649553924798965,
"step": 960
},
{
"clip_ratio": 0.0,
"completion_length": 773.3937805175781,
"epoch": 0.3294358624221217,
"grad_norm": 8.863078117370605,
"kl": 2.7748046875,
"learning_rate": 2.544100254433396e-06,
"loss": 0.3388,
"reward": 1.8790179371833802,
"reward_std": 0.5164345070719719,
"rewards/accuracy_reward": 0.1598214370198548,
"rewards/format_reward": 0.836607176065445,
"rewards/tag_count_reward": 0.8825893253087997,
"step": 965
},
{
"clip_ratio": 0.0,
"completion_length": 789.2919982910156,
"epoch": 0.3311427839890757,
"grad_norm": 10.961185455322266,
"kl": 3.2140625,
"learning_rate": 2.537664101167453e-06,
"loss": 0.3765,
"reward": 1.897991156578064,
"reward_std": 0.5266193248331547,
"rewards/accuracy_reward": 0.1919642908498645,
"rewards/format_reward": 0.8276786118745804,
"rewards/tag_count_reward": 0.8783482581377029,
"step": 970
},
{
"clip_ratio": 0.0,
"completion_length": 822.3991485595703,
"epoch": 0.3328497055560297,
"grad_norm": 3.4954206943511963,
"kl": 3.3,
"learning_rate": 2.531191100705102e-06,
"loss": 0.3987,
"reward": 1.801116168498993,
"reward_std": 0.596581481397152,
"rewards/accuracy_reward": 0.15803572116419673,
"rewards/format_reward": 0.795535746216774,
"rewards/tag_count_reward": 0.8475446820259094,
"step": 975
},
{
"clip_ratio": 0.0,
"completion_length": 819.3446838378907,
"epoch": 0.3345566271229837,
"grad_norm": 4.400045871734619,
"kl": 4.9328125,
"learning_rate": 2.5246814829009937e-06,
"loss": 0.493,
"reward": 1.7078125894069671,
"reward_std": 0.6570774331688881,
"rewards/accuracy_reward": 0.14910714896395802,
"rewards/format_reward": 0.7455357491970063,
"rewards/tag_count_reward": 0.8131696790456772,
"step": 980
},
{
"clip_ratio": 0.0,
"completion_length": 791.8509246826172,
"epoch": 0.3362635486899377,
"grad_norm": 5.740270137786865,
"kl": 2.66484375,
"learning_rate": 2.518135478910051e-06,
"loss": 0.3217,
"reward": 1.900223296880722,
"reward_std": 0.5493961855769157,
"rewards/accuracy_reward": 0.18928572423756124,
"rewards/format_reward": 0.8312500417232513,
"rewards/tag_count_reward": 0.8796875387430191,
"step": 985
},
{
"clip_ratio": 0.0,
"completion_length": 797.6411010742188,
"epoch": 0.3379704702568917,
"grad_norm": 7.0871076583862305,
"kl": 2.4041015625,
"learning_rate": 2.5115533211792624e-06,
"loss": 0.2537,
"reward": 1.90089293718338,
"reward_std": 0.4941477760672569,
"rewards/accuracy_reward": 0.14821429271250963,
"rewards/format_reward": 0.8571429044008255,
"rewards/tag_count_reward": 0.8955357670783997,
"step": 990
},
{
"clip_ratio": 0.0,
"completion_length": 750.6303894042969,
"epoch": 0.3396773918238457,
"grad_norm": 2.675424575805664,
"kl": 2.11796875,
"learning_rate": 2.5049352434394263e-06,
"loss": 0.2056,
"reward": 2.0265625774860383,
"reward_std": 0.39688876681029794,
"rewards/accuracy_reward": 0.18392857871949672,
"rewards/format_reward": 0.9044643312692642,
"rewards/tag_count_reward": 0.9381696909666062,
"step": 995
},
{
"clip_ratio": 0.0,
"completion_length": 783.2669982910156,
"epoch": 0.34138431339079967,
"grad_norm": 5.672367572784424,
"kl": 1.9142578125,
"learning_rate": 2.4982814806968506e-06,
"loss": 0.2906,
"reward": 1.9834822058677672,
"reward_std": 0.4078844651579857,
"rewards/accuracy_reward": 0.15892857778817415,
"rewards/format_reward": 0.8955357551574707,
"rewards/tag_count_reward": 0.9290179044008255,
"step": 1000
},
{
"clip_ratio": 0.0,
"completion_length": 814.5116424560547,
"epoch": 0.3430912349577537,
"grad_norm": 4.191405773162842,
"kl": 2.8767578125,
"learning_rate": 2.4915922692250107e-06,
"loss": 0.3323,
"reward": 1.9546875834465027,
"reward_std": 0.512929305434227,
"rewards/accuracy_reward": 0.19910715222358705,
"rewards/format_reward": 0.8580357611179352,
"rewards/tag_count_reward": 0.8975446790456771,
"step": 1005
},
{
"clip_ratio": 0.0,
"completion_length": 802.0875366210937,
"epoch": 0.3447981565247077,
"grad_norm": 10.191553115844727,
"kl": 2.809375,
"learning_rate": 2.484867846556157e-06,
"loss": 0.3604,
"reward": 1.8754464983940125,
"reward_std": 0.5539876684546471,
"rewards/accuracy_reward": 0.16250000847503543,
"rewards/format_reward": 0.8330357551574707,
"rewards/tag_count_reward": 0.8799107581377029,
"step": 1010
},
{
"clip_ratio": 0.0,
"completion_length": 760.6366394042968,
"epoch": 0.3465050780916617,
"grad_norm": 9.369229316711426,
"kl": 2.7625,
"learning_rate": 2.4781084514728797e-06,
"loss": 0.3838,
"reward": 1.8613839983940124,
"reward_std": 0.5541360631585122,
"rewards/accuracy_reward": 0.17589286714792252,
"rewards/format_reward": 0.8142857551574707,
"rewards/tag_count_reward": 0.871205398440361,
"step": 1015
},
{
"clip_ratio": 0.0,
"completion_length": 756.9161071777344,
"epoch": 0.3482119996586157,
"grad_norm": 49.239768981933594,
"kl": 2.9908203125,
"learning_rate": 2.471314323999632e-06,
"loss": 0.3966,
"reward": 1.8754465281963348,
"reward_std": 0.548448670655489,
"rewards/accuracy_reward": 0.17589286426082254,
"rewards/format_reward": 0.8223214656114578,
"rewards/tag_count_reward": 0.8772321790456772,
"step": 1020
},
{
"clip_ratio": 0.0,
"completion_length": 724.8803802490235,
"epoch": 0.34991892122556967,
"grad_norm": 8.442652702331543,
"kl": 2.96015625,
"learning_rate": 2.4644857053942066e-06,
"loss": 0.3474,
"reward": 1.855803668498993,
"reward_std": 0.5516223564743996,
"rewards/accuracy_reward": 0.16160714980214835,
"rewards/format_reward": 0.8151786029338837,
"rewards/tag_count_reward": 0.8790178924798966,
"step": 1025
},
{
"clip_ratio": 0.0,
"completion_length": 755.0678955078125,
"epoch": 0.3516258427925237,
"grad_norm": 6.77180814743042,
"kl": 2.8849609375,
"learning_rate": 2.457622838139166e-06,
"loss": 0.4207,
"reward": 1.8727679431438446,
"reward_std": 0.6202140808105469,
"rewards/accuracy_reward": 0.18928572423756124,
"rewards/format_reward": 0.8116071820259094,
"rewards/tag_count_reward": 0.8718750387430191,
"step": 1030
},
{
"clip_ratio": 0.0,
"completion_length": 849.719677734375,
"epoch": 0.3533327643594777,
"grad_norm": 10.987722396850586,
"kl": 3.19482421875,
"learning_rate": 2.4507259659332335e-06,
"loss": 0.3413,
"reward": 1.7948661506175996,
"reward_std": 0.6245307192206383,
"rewards/accuracy_reward": 0.17410715082660316,
"rewards/format_reward": 0.7767857521772384,
"rewards/tag_count_reward": 0.8439732521772385,
"step": 1035
},
{
"clip_ratio": 0.0,
"completion_length": 895.3580810546875,
"epoch": 0.3550396859264317,
"grad_norm": 6.328294277191162,
"kl": 3.1818359375,
"learning_rate": 2.443795333682642e-06,
"loss": 0.3346,
"reward": 1.7332590043544769,
"reward_std": 0.5950308412313461,
"rewards/accuracy_reward": 0.11160714598372579,
"rewards/format_reward": 0.7758929014205933,
"rewards/tag_count_reward": 0.8457589685916901,
"step": 1040
},
{
"clip_ratio": 0.0,
"completion_length": 793.3732391357422,
"epoch": 0.35674660749338566,
"grad_norm": 4.254904270172119,
"kl": 2.4732421875,
"learning_rate": 2.4368311874924335e-06,
"loss": 0.2072,
"reward": 1.8361608028411864,
"reward_std": 0.5906515352427959,
"rewards/accuracy_reward": 0.2044642929919064,
"rewards/format_reward": 0.7812500387430191,
"rewards/tag_count_reward": 0.8504464656114579,
"step": 1045
},
{
"clip_ratio": 0.0,
"completion_length": 773.1652160644531,
"epoch": 0.35845352906033967,
"grad_norm": 5.058516979217529,
"kl": 1.618115234375,
"learning_rate": 2.4298337746577227e-06,
"loss": 0.1907,
"reward": 1.948660784959793,
"reward_std": 0.4876935049891472,
"rewards/accuracy_reward": 0.1875000072643161,
"rewards/format_reward": 0.8535714715719223,
"rewards/tag_count_reward": 0.9075893253087998,
"step": 1050
},
{
"clip_ratio": 0.0,
"completion_length": 853.6509307861328,
"epoch": 0.3601604506272937,
"grad_norm": 7.05323600769043,
"kl": 3.0990234375,
"learning_rate": 2.4228033436549135e-06,
"loss": 0.3536,
"reward": 1.8406250953674317,
"reward_std": 0.5797086969017983,
"rewards/accuracy_reward": 0.1598214370198548,
"rewards/format_reward": 0.8080357611179352,
"rewards/tag_count_reward": 0.8727678984403611,
"step": 1055
},
{
"clip_ratio": 0.0,
"completion_length": 881.2268157958985,
"epoch": 0.3618673721942477,
"grad_norm": 7.7438883781433105,
"kl": 2.562109375,
"learning_rate": 2.4157401441328782e-06,
"loss": 0.311,
"reward": 1.8348215222358704,
"reward_std": 0.5855709910392761,
"rewards/accuracy_reward": 0.16339286286383867,
"rewards/format_reward": 0.80357146859169,
"rewards/tag_count_reward": 0.8678571909666062,
"step": 1060
},
{
"clip_ratio": 0.0,
"completion_length": 815.9911071777344,
"epoch": 0.36357429376120165,
"grad_norm": 2.9513394832611084,
"kl": 3.583203125,
"learning_rate": 2.4086444269040905e-06,
"loss": 0.3823,
"reward": 1.8600447118282317,
"reward_std": 0.560142383724451,
"rewards/accuracy_reward": 0.16071429243311286,
"rewards/format_reward": 0.8205357521772385,
"rewards/tag_count_reward": 0.8787946820259094,
"step": 1065
},
{
"clip_ratio": 0.0,
"completion_length": 861.2714660644531,
"epoch": 0.36528121532815566,
"grad_norm": 5.095993995666504,
"kl": 3.023828125,
"learning_rate": 2.4015164439357192e-06,
"loss": 0.2902,
"reward": 1.8332590162754059,
"reward_std": 0.6293601065874099,
"rewards/accuracy_reward": 0.16785715147852898,
"rewards/format_reward": 0.8000000417232513,
"rewards/tag_count_reward": 0.8654018253087997,
"step": 1070
},
{
"clip_ratio": 0.0,
"completion_length": 789.6696807861329,
"epoch": 0.3669881368951097,
"grad_norm": 3.97644305229187,
"kl": 2.890625,
"learning_rate": 2.3943564483406825e-06,
"loss": 0.3023,
"reward": 1.897991156578064,
"reward_std": 0.5991736590862274,
"rewards/accuracy_reward": 0.20000000931322576,
"rewards/format_reward": 0.8187500417232514,
"rewards/tag_count_reward": 0.8792411088943481,
"step": 1075
},
{
"clip_ratio": 0.0,
"completion_length": 780.6500396728516,
"epoch": 0.3686950584620637,
"grad_norm": 2.795880079269409,
"kl": 2.1537109375,
"learning_rate": 2.387164694368659e-06,
"loss": 0.2472,
"reward": 1.9285715222358704,
"reward_std": 0.5227874010801316,
"rewards/accuracy_reward": 0.1892857223749161,
"rewards/format_reward": 0.8401786148548126,
"rewards/tag_count_reward": 0.8991071790456772,
"step": 1080
},
{
"clip_ratio": 0.0,
"completion_length": 799.2286102294922,
"epoch": 0.37040198002901764,
"grad_norm": 5.091084003448486,
"kl": 4.1421875,
"learning_rate": 2.3799414373970595e-06,
"loss": 0.3939,
"reward": 1.7937500655651093,
"reward_std": 0.5763512536883354,
"rewards/accuracy_reward": 0.1687500085681677,
"rewards/format_reward": 0.7803571790456771,
"rewards/tag_count_reward": 0.8446428954601288,
"step": 1085
},
{
"clip_ratio": 0.0,
"completion_length": 763.5044982910156,
"epoch": 0.37210890159597165,
"grad_norm": 4.417270183563232,
"kl": 2.73828125,
"learning_rate": 2.372686933921957e-06,
"loss": 0.2796,
"reward": 1.7484375894069673,
"reward_std": 0.6087081640958786,
"rewards/accuracy_reward": 0.12053572125732899,
"rewards/format_reward": 0.7758928954601287,
"rewards/tag_count_reward": 0.8520089656114578,
"step": 1090
},
{
"clip_ratio": 0.0,
"completion_length": 762.025033569336,
"epoch": 0.37381582316292566,
"grad_norm": 3.8321306705474854,
"kl": 3.471875,
"learning_rate": 2.3654014415489823e-06,
"loss": 0.3855,
"reward": 1.7381697118282318,
"reward_std": 0.6132876291871071,
"rewards/accuracy_reward": 0.137500006146729,
"rewards/format_reward": 0.7633928924798965,
"rewards/tag_count_reward": 0.8372768223285675,
"step": 1095
},
{
"clip_ratio": 0.0,
"completion_length": 748.9294982910156,
"epoch": 0.3755227447298797,
"grad_norm": 6.6013569831848145,
"kl": 2.5224609375,
"learning_rate": 2.3580852189841734e-06,
"loss": 0.2485,
"reward": 1.7801340043544769,
"reward_std": 0.5882782399654388,
"rewards/accuracy_reward": 0.11785714775323868,
"rewards/format_reward": 0.7991071850061416,
"rewards/tag_count_reward": 0.8631696850061417,
"step": 1100
},
{
"clip_ratio": 0.0,
"completion_length": 722.4464599609375,
"epoch": 0.3772296662968337,
"grad_norm": 10.079838752746582,
"kl": 2.2388671875,
"learning_rate": 2.35073852602479e-06,
"loss": 0.2362,
"reward": 1.9562501072883607,
"reward_std": 0.42431456968188286,
"rewards/accuracy_reward": 0.1857142916880548,
"rewards/format_reward": 0.8633928954601288,
"rewards/tag_count_reward": 0.907142898440361,
"step": 1105
},
{
"clip_ratio": 0.0,
"completion_length": 735.2321746826171,
"epoch": 0.37893658786378764,
"grad_norm": 2.2775678634643555,
"kl": 2.25322265625,
"learning_rate": 2.343361623550087e-06,
"loss": 0.2052,
"reward": 1.963616180419922,
"reward_std": 0.4199460901319981,
"rewards/accuracy_reward": 0.18303572265431284,
"rewards/format_reward": 0.8732143223285675,
"rewards/tag_count_reward": 0.9073661148548127,
"step": 1110
},
{
"clip_ratio": 0.0,
"completion_length": 686.8169891357422,
"epoch": 0.38064350943074166,
"grad_norm": 0.7645272016525269,
"kl": 1.0261962890625,
"learning_rate": 2.3359547735120533e-06,
"loss": 0.0896,
"reward": 2.073214370012283,
"reward_std": 0.35470662005245684,
"rewards/accuracy_reward": 0.23214286658912897,
"rewards/format_reward": 0.9089286148548126,
"rewards/tag_count_reward": 0.9321429014205933,
"step": 1115
},
{
"clip_ratio": 0.0,
"completion_length": 673.1143157958984,
"epoch": 0.38235043099769567,
"grad_norm": 2.4553771018981934,
"kl": 1.4921875,
"learning_rate": 2.328518238926108e-06,
"loss": 0.1956,
"reward": 2.0895090222358705,
"reward_std": 0.3072059566155076,
"rewards/accuracy_reward": 0.20089286640286447,
"rewards/format_reward": 0.9339286088943481,
"rewards/tag_count_reward": 0.9546875387430191,
"step": 1120
},
{
"clip_ratio": 0.0,
"completion_length": 710.1732513427735,
"epoch": 0.3840573525646497,
"grad_norm": 14.056977272033691,
"kl": 1.6969482421875,
"learning_rate": 2.32105228386176e-06,
"loss": 0.1577,
"reward": 2.0502232909202576,
"reward_std": 0.3774921327829361,
"rewards/accuracy_reward": 0.17589286342263222,
"rewards/format_reward": 0.9223214656114578,
"rewards/tag_count_reward": 0.9520089715719223,
"step": 1125
},
{
"clip_ratio": 0.0,
"completion_length": 625.2696655273437,
"epoch": 0.38576427413160363,
"grad_norm": 5.242733478546143,
"kl": 1.62451171875,
"learning_rate": 2.313557173433233e-06,
"loss": 0.1798,
"reward": 2.0428572416305544,
"reward_std": 0.3383133355528116,
"rewards/accuracy_reward": 0.16339286314323545,
"rewards/format_reward": 0.9241071790456772,
"rewards/tag_count_reward": 0.9553571850061416,
"step": 1130
},
{
"clip_ratio": 0.0,
"completion_length": 649.1410980224609,
"epoch": 0.38747119569855765,
"grad_norm": 2.7701191902160645,
"kl": 2.2046142578125,
"learning_rate": 2.306033173790051e-06,
"loss": 0.2391,
"reward": 2.039062571525574,
"reward_std": 0.4759730361402035,
"rewards/accuracy_reward": 0.22321429550647737,
"rewards/format_reward": 0.8892857551574707,
"rewards/tag_count_reward": 0.9265625417232514,
"step": 1135
},
{
"clip_ratio": 0.0,
"completion_length": 717.9062866210937,
"epoch": 0.38917811726551166,
"grad_norm": 4.026260852813721,
"kl": 3.090625,
"learning_rate": 2.298480552107586e-06,
"loss": 0.3405,
"reward": 1.8812500834465027,
"reward_std": 0.5901964485645295,
"rewards/accuracy_reward": 0.14642857741564513,
"rewards/format_reward": 0.8419643312692642,
"rewards/tag_count_reward": 0.8928571879863739,
"step": 1140
},
{
"clip_ratio": 0.0,
"completion_length": 728.6393188476562,
"epoch": 0.39088503883246567,
"grad_norm": 5.9124603271484375,
"kl": 3.3755859375,
"learning_rate": 2.2908995765775724e-06,
"loss": 0.3093,
"reward": 1.8071429431438446,
"reward_std": 0.5719165071845055,
"rewards/accuracy_reward": 0.15000000419095158,
"rewards/format_reward": 0.7964286088943482,
"rewards/tag_count_reward": 0.8607143253087998,
"step": 1145
},
{
"clip_ratio": 0.0,
"completion_length": 720.6089508056641,
"epoch": 0.3925919603994196,
"grad_norm": 1.1206648349761963,
"kl": 1.8416015625,
"learning_rate": 2.283290516398582e-06,
"loss": 0.2198,
"reward": 1.9783483147621155,
"reward_std": 0.5003036454319953,
"rewards/accuracy_reward": 0.20178572246804832,
"rewards/format_reward": 0.8660714715719223,
"rewards/tag_count_reward": 0.9104911118745804,
"step": 1150
},
{
"clip_ratio": 0.0,
"completion_length": 688.6893173217774,
"epoch": 0.39429888196637364,
"grad_norm": 2.443659543991089,
"kl": 1.76376953125,
"learning_rate": 2.275653641766466e-06,
"loss": 0.1235,
"reward": 2.005803668498993,
"reward_std": 0.3753303915262222,
"rewards/accuracy_reward": 0.15714286388829352,
"rewards/format_reward": 0.9116071820259094,
"rewards/tag_count_reward": 0.9370536148548126,
"step": 1155
},
{
"clip_ratio": 0.0,
"completion_length": 725.0339630126953,
"epoch": 0.39600580353332765,
"grad_norm": 1.9696784019470215,
"kl": 0.81748046875,
"learning_rate": 2.2679892238647593e-06,
"loss": 0.127,
"reward": 2.1151786506175996,
"reward_std": 0.26731859482824805,
"rewards/accuracy_reward": 0.19375001126900315,
"rewards/format_reward": 0.9535714536905289,
"rewards/tag_count_reward": 0.967857176065445,
"step": 1160
},
{
"clip_ratio": 0.0,
"completion_length": 770.9250274658203,
"epoch": 0.39771272510028166,
"grad_norm": 6.627246856689453,
"kl": 1.4739501953125,
"learning_rate": 2.2602975348550526e-06,
"loss": 0.169,
"reward": 2.117857205867767,
"reward_std": 0.2829253111034632,
"rewards/accuracy_reward": 0.2026785809546709,
"rewards/format_reward": 0.9508928924798965,
"rewards/tag_count_reward": 0.9642857521772384,
"step": 1165
},
{
"clip_ratio": 0.0,
"completion_length": 765.2500427246093,
"epoch": 0.3994196466672356,
"grad_norm": 1.477338433265686,
"kl": 1.10830078125,
"learning_rate": 2.2525788478673256e-06,
"loss": 0.1475,
"reward": 2.150223308801651,
"reward_std": 0.3132738400250673,
"rewards/accuracy_reward": 0.24642858207225798,
"rewards/format_reward": 0.9437500387430191,
"rewards/tag_count_reward": 0.9600446879863739,
"step": 1170
},
{
"clip_ratio": 0.0,
"completion_length": 783.8303863525391,
"epoch": 0.40112656823418963,
"grad_norm": 3.81966495513916,
"kl": 1.746142578125,
"learning_rate": 2.2448334369902512e-06,
"loss": 0.167,
"reward": 2.0694197356700896,
"reward_std": 0.39150173366069796,
"rewards/accuracy_reward": 0.21160715222358703,
"rewards/format_reward": 0.9142857521772385,
"rewards/tag_count_reward": 0.943526816368103,
"step": 1175
},
{
"clip_ratio": 0.0,
"completion_length": 875.8545043945312,
"epoch": 0.40283348980114364,
"grad_norm": 4.390524387359619,
"kl": 1.417822265625,
"learning_rate": 2.2370615772614596e-06,
"loss": 0.1507,
"reward": 2.0004465222358703,
"reward_std": 0.38841529097408056,
"rewards/accuracy_reward": 0.13928572116419674,
"rewards/format_reward": 0.9187500417232514,
"rewards/tag_count_reward": 0.9424107611179352,
"step": 1180
},
{
"clip_ratio": 0.0,
"completion_length": 854.280404663086,
"epoch": 0.40454041136809765,
"grad_norm": 3.236765146255493,
"kl": 1.91328125,
"learning_rate": 2.229263544657774e-06,
"loss": 0.1798,
"reward": 2.026785832643509,
"reward_std": 0.4409887820482254,
"rewards/accuracy_reward": 0.20535715129226445,
"rewards/format_reward": 0.8919643223285675,
"rewards/tag_count_reward": 0.929464328289032,
"step": 1185
},
{
"clip_ratio": 0.0,
"completion_length": 848.9277191162109,
"epoch": 0.4062473329350516,
"grad_norm": 2.797024965286255,
"kl": 2.8095703125,
"learning_rate": 2.2214396160854086e-06,
"loss": 0.3007,
"reward": 1.9095982909202576,
"reward_std": 0.5447552859783172,
"rewards/accuracy_reward": 0.16696429532021284,
"rewards/format_reward": 0.848214328289032,
"rewards/tag_count_reward": 0.8944196909666061,
"step": 1190
},
{
"clip_ratio": 0.0,
"completion_length": 863.5241485595703,
"epoch": 0.4079542545020056,
"grad_norm": 1.5841748714447021,
"kl": 2.126171875,
"learning_rate": 2.2135900693701396e-06,
"loss": 0.1976,
"reward": 1.966517949104309,
"reward_std": 0.49482071250677107,
"rewards/accuracy_reward": 0.19375001098960637,
"rewards/format_reward": 0.8687500417232513,
"rewards/tag_count_reward": 0.9040179014205932,
"step": 1195
},
{
"clip_ratio": 0.0,
"completion_length": 841.4991424560546,
"epoch": 0.40966117606895963,
"grad_norm": 2.687502861022949,
"kl": 2.2541015625,
"learning_rate": 2.2057151832474344e-06,
"loss": 0.1548,
"reward": 1.9569197297096252,
"reward_std": 0.49666360318660735,
"rewards/accuracy_reward": 0.17500001080334188,
"rewards/format_reward": 0.8705357581377029,
"rewards/tag_count_reward": 0.9113839715719223,
"step": 1200
},
{
"clip_ratio": 0.0,
"completion_length": 822.8750457763672,
"epoch": 0.41136809763591364,
"grad_norm": 4.721628189086914,
"kl": 1.90126953125,
"learning_rate": 2.197815237352559e-06,
"loss": 0.2051,
"reward": 1.9877233028411865,
"reward_std": 0.4236783929169178,
"rewards/accuracy_reward": 0.16250000623986124,
"rewards/format_reward": 0.9000000387430191,
"rewards/tag_count_reward": 0.9252232581377029,
"step": 1205
},
{
"clip_ratio": 0.0,
"completion_length": 890.1616455078125,
"epoch": 0.41307501920286765,
"grad_norm": 5.242164611816406,
"kl": 1.93837890625,
"learning_rate": 2.189890512210643e-06,
"loss": 0.171,
"reward": 2.0156251013278963,
"reward_std": 0.44212441742420194,
"rewards/accuracy_reward": 0.20357143450528384,
"rewards/format_reward": 0.8901786148548126,
"rewards/tag_count_reward": 0.9218750387430191,
"step": 1210
},
{
"clip_ratio": 0.0,
"completion_length": 859.317007446289,
"epoch": 0.4147819407698216,
"grad_norm": 5.278153419494629,
"kl": 2.12939453125,
"learning_rate": 2.181941289226724e-06,
"loss": 0.2144,
"reward": 2.0169643819332124,
"reward_std": 0.41962831988930704,
"rewards/accuracy_reward": 0.18660715240985154,
"rewards/format_reward": 0.900892898440361,
"rewards/tag_count_reward": 0.9294643372297287,
"step": 1215
},
{
"clip_ratio": 0.0,
"completion_length": 826.7553985595703,
"epoch": 0.4164888623367756,
"grad_norm": 3.9751880168914795,
"kl": 2.369921875,
"learning_rate": 2.173967850675749e-06,
"loss": 0.2736,
"reward": 1.9564732968807221,
"reward_std": 0.44398799240589143,
"rewards/accuracy_reward": 0.1392857214435935,
"rewards/format_reward": 0.8946429014205932,
"rewards/tag_count_reward": 0.9225446850061416,
"step": 1220
},
{
"clip_ratio": 0.0,
"completion_length": 787.3187896728516,
"epoch": 0.41819578390372963,
"grad_norm": 2.2601590156555176,
"kl": 1.5919921875,
"learning_rate": 2.1659704796925556e-06,
"loss": 0.1694,
"reward": 1.9640626013278961,
"reward_std": 0.4538421332836151,
"rewards/accuracy_reward": 0.1473214347846806,
"rewards/format_reward": 0.8928571909666061,
"rewards/tag_count_reward": 0.9238839715719223,
"step": 1225
},
{
"clip_ratio": 0.0,
"completion_length": 787.5107513427735,
"epoch": 0.41990270547068365,
"grad_norm": 3.633096933364868,
"kl": 2.5519287109375,
"learning_rate": 2.157949460261816e-06,
"loss": 0.2969,
"reward": 1.9627233147621155,
"reward_std": 0.5212313048541546,
"rewards/accuracy_reward": 0.17678572060540318,
"rewards/format_reward": 0.8767857521772384,
"rewards/tag_count_reward": 0.9091518312692642,
"step": 1230
},
{
"clip_ratio": 0.0,
"completion_length": 823.5098602294922,
"epoch": 0.4216096270376376,
"grad_norm": 4.196262359619141,
"kl": 3.162890625,
"learning_rate": 2.149905077207953e-06,
"loss": 0.3764,
"reward": 1.890401875972748,
"reward_std": 0.5524288520216942,
"rewards/accuracy_reward": 0.18928572311997413,
"rewards/format_reward": 0.8285714656114578,
"rewards/tag_count_reward": 0.8725446820259094,
"step": 1235
},
{
"clip_ratio": 0.0,
"completion_length": 800.7946807861329,
"epoch": 0.4233165486045916,
"grad_norm": 14.379007339477539,
"kl": 2.49970703125,
"learning_rate": 2.1418376161850247e-06,
"loss": 0.2902,
"reward": 1.9551340341567993,
"reward_std": 0.5261093828827142,
"rewards/accuracy_reward": 0.1857142945751548,
"rewards/format_reward": 0.8633929044008255,
"rewards/tag_count_reward": 0.9060268223285675,
"step": 1240
},
{
"clip_ratio": 0.0,
"completion_length": 780.7705718994141,
"epoch": 0.4250234701715456,
"grad_norm": 1.6710326671600342,
"kl": 1.42890625,
"learning_rate": 2.133747363666584e-06,
"loss": 0.1738,
"reward": 2.026116156578064,
"reward_std": 0.39677606597542764,
"rewards/accuracy_reward": 0.17500000940635801,
"rewards/format_reward": 0.9151786118745804,
"rewards/tag_count_reward": 0.9359375447034836,
"step": 1245
},
{
"clip_ratio": 0.0,
"completion_length": 776.9598541259766,
"epoch": 0.42673039173849964,
"grad_norm": 2.11255145072937,
"kl": 2.662890625,
"learning_rate": 2.1256346069355026e-06,
"loss": 0.3397,
"reward": 2.0069197475910188,
"reward_std": 0.45871393829584123,
"rewards/accuracy_reward": 0.20267857946455478,
"rewards/format_reward": 0.8848214745521545,
"rewards/tag_count_reward": 0.9194196820259094,
"step": 1250
},
{
"clip_ratio": 0.0,
"completion_length": 817.2223602294922,
"epoch": 0.4284373133054536,
"grad_norm": 1.47514009475708,
"kl": 1.4896484375,
"learning_rate": 2.117499634073772e-06,
"loss": 0.2132,
"reward": 2.0267858028411867,
"reward_std": 0.42209520787000654,
"rewards/accuracy_reward": 0.17946429522708057,
"rewards/format_reward": 0.910714328289032,
"rewards/tag_count_reward": 0.9366071909666062,
"step": 1255
},
{
"clip_ratio": 0.0,
"completion_length": 768.3696807861328,
"epoch": 0.4301442348724076,
"grad_norm": 4.074100971221924,
"kl": 2.11015625,
"learning_rate": 2.1093427339522736e-06,
"loss": 0.2266,
"reward": 1.9930804431438447,
"reward_std": 0.403466971218586,
"rewards/accuracy_reward": 0.1750000081025064,
"rewards/format_reward": 0.894642898440361,
"rewards/tag_count_reward": 0.9234375357627869,
"step": 1260
},
{
"clip_ratio": 0.0,
"completion_length": 787.3937866210938,
"epoch": 0.4318511564393616,
"grad_norm": 3.29717755317688,
"kl": 1.61904296875,
"learning_rate": 2.1011641962205187e-06,
"loss": 0.228,
"reward": 1.9837054312229156,
"reward_std": 0.3893513225018978,
"rewards/accuracy_reward": 0.13660714868456125,
"rewards/format_reward": 0.9125000417232514,
"rewards/tag_count_reward": 0.9345982551574707,
"step": 1265
},
{
"clip_ratio": 0.0,
"completion_length": 810.4214721679688,
"epoch": 0.43355807800631563,
"grad_norm": 1.2225881814956665,
"kl": 2.337841796875,
"learning_rate": 2.092964311296366e-06,
"loss": 0.366,
"reward": 2.038616180419922,
"reward_std": 0.45512128323316575,
"rewards/accuracy_reward": 0.240178579185158,
"rewards/format_reward": 0.8839286088943481,
"rewards/tag_count_reward": 0.9145089685916901,
"step": 1270
},
{
"clip_ratio": 0.0,
"completion_length": 765.8562805175782,
"epoch": 0.4352649995732696,
"grad_norm": 0.9028781652450562,
"kl": 1.768310546875,
"learning_rate": 2.0847433703557086e-06,
"loss": 0.2351,
"reward": 1.9801340281963349,
"reward_std": 0.42181163243949416,
"rewards/accuracy_reward": 0.14732143403962256,
"rewards/format_reward": 0.9035714715719223,
"rewards/tag_count_reward": 0.9292411148548126,
"step": 1275
},
{
"clip_ratio": 0.0,
"completion_length": 774.1705718994141,
"epoch": 0.4369719211402236,
"grad_norm": 3.9188528060913086,
"kl": 1.5515625,
"learning_rate": 2.0765016653221312e-06,
"loss": 0.1852,
"reward": 2.011160808801651,
"reward_std": 0.4215161487460136,
"rewards/accuracy_reward": 0.17053572349250318,
"rewards/format_reward": 0.9098214656114578,
"rewards/tag_count_reward": 0.9308036059141159,
"step": 1280
},
{
"clip_ratio": 0.0,
"completion_length": 752.0446746826171,
"epoch": 0.4386788427071776,
"grad_norm": 1.324172019958496,
"kl": 1.8251708984375,
"learning_rate": 2.068239488856549e-06,
"loss": 0.1298,
"reward": 1.9770090222358703,
"reward_std": 0.3746281571686268,
"rewards/accuracy_reward": 0.13303572088479995,
"rewards/format_reward": 0.9116071850061417,
"rewards/tag_count_reward": 0.9323661148548126,
"step": 1285
},
{
"clip_ratio": 0.0,
"completion_length": 783.0786010742188,
"epoch": 0.4403857642741316,
"grad_norm": 1.5869454145431519,
"kl": 1.52734375,
"learning_rate": 2.05995713434681e-06,
"loss": 0.1537,
"reward": 1.961384004354477,
"reward_std": 0.4327257826924324,
"rewards/accuracy_reward": 0.15446429289877414,
"rewards/format_reward": 0.8857143253087998,
"rewards/tag_count_reward": 0.921205398440361,
"step": 1290
},
{
"clip_ratio": 0.0,
"completion_length": 810.894677734375,
"epoch": 0.4420926858410856,
"grad_norm": 3.493547201156616,
"kl": 2.1126953125,
"learning_rate": 2.0516548958972816e-06,
"loss": 0.2248,
"reward": 1.917410784959793,
"reward_std": 0.5152807034552097,
"rewards/accuracy_reward": 0.16339286556467414,
"rewards/format_reward": 0.8562500476837158,
"rewards/tag_count_reward": 0.8977679044008255,
"step": 1295
},
{
"clip_ratio": 0.0,
"completion_length": 776.244677734375,
"epoch": 0.4437996074080396,
"grad_norm": 1.382163405418396,
"kl": 1.617578125,
"learning_rate": 2.043333068318405e-06,
"loss": 0.1537,
"reward": 1.9573661506175994,
"reward_std": 0.5043010532855987,
"rewards/accuracy_reward": 0.1919642925262451,
"rewards/format_reward": 0.8625000417232513,
"rewards/tag_count_reward": 0.902901828289032,
"step": 1300
},
{
"clip_ratio": 0.0,
"completion_length": 761.0607482910157,
"epoch": 0.4455065289749936,
"grad_norm": 2.492479085922241,
"kl": 2.128125,
"learning_rate": 2.0349919471162245e-06,
"loss": 0.1946,
"reward": 1.8754465103149414,
"reward_std": 0.5213326171040535,
"rewards/accuracy_reward": 0.1455357194878161,
"rewards/format_reward": 0.8401786118745804,
"rewards/tag_count_reward": 0.8897321820259094,
"step": 1305
},
{
"clip_ratio": 0.0,
"completion_length": 812.150927734375,
"epoch": 0.4472134505419476,
"grad_norm": 4.123756408691406,
"kl": 2.7369140625,
"learning_rate": 2.0266318284818983e-06,
"loss": 0.2577,
"reward": 1.83883935213089,
"reward_std": 0.5365928649902344,
"rewards/accuracy_reward": 0.14642857788130642,
"rewards/format_reward": 0.8160714715719223,
"rewards/tag_count_reward": 0.8763393282890319,
"step": 1310
},
{
"clip_ratio": 0.0,
"completion_length": 758.2562866210938,
"epoch": 0.4489203721089016,
"grad_norm": 3.1437504291534424,
"kl": 1.9005859375,
"learning_rate": 2.0182530092811776e-06,
"loss": 0.2219,
"reward": 1.877678668498993,
"reward_std": 0.5208032101392746,
"rewards/accuracy_reward": 0.1464285783469677,
"rewards/format_reward": 0.84107146859169,
"rewards/tag_count_reward": 0.8901786118745804,
"step": 1315
},
{
"clip_ratio": 0.0,
"completion_length": 773.2527160644531,
"epoch": 0.4506272936758556,
"grad_norm": 3.798027753829956,
"kl": 2.2734375,
"learning_rate": 2.0098557870438672e-06,
"loss": 0.2386,
"reward": 1.937946516275406,
"reward_std": 0.5240325286984444,
"rewards/accuracy_reward": 0.18482143841683865,
"rewards/format_reward": 0.8526786148548127,
"rewards/tag_count_reward": 0.9004464715719223,
"step": 1320
},
{
"clip_ratio": 0.0,
"completion_length": 755.4589569091797,
"epoch": 0.4523342152428096,
"grad_norm": 4.841824054718018,
"kl": 2.136328125,
"learning_rate": 2.001440459953258e-06,
"loss": 0.1876,
"reward": 2.003125101327896,
"reward_std": 0.4476234719157219,
"rewards/accuracy_reward": 0.20178572181612253,
"rewards/format_reward": 0.883035758137703,
"rewards/tag_count_reward": 0.9183036148548126,
"step": 1325
},
{
"clip_ratio": 0.0,
"completion_length": 748.7973480224609,
"epoch": 0.4540411368097636,
"grad_norm": 19.314414978027344,
"kl": 1.78818359375,
"learning_rate": 1.99300732683554e-06,
"loss": 0.1903,
"reward": 2.0245536625385285,
"reward_std": 0.39623707011342046,
"rewards/accuracy_reward": 0.1741071510128677,
"rewards/format_reward": 0.9125000447034836,
"rewards/tag_count_reward": 0.9379464775323868,
"step": 1330
},
{
"clip_ratio": 0.0,
"completion_length": 795.800927734375,
"epoch": 0.4557480583767176,
"grad_norm": 1.6212748289108276,
"kl": 2.395703125,
"learning_rate": 1.9845566871491923e-06,
"loss": 0.2654,
"reward": 1.9578125953674317,
"reward_std": 0.4312289670109749,
"rewards/accuracy_reward": 0.14017857611179352,
"rewards/format_reward": 0.8919643282890319,
"rewards/tag_count_reward": 0.9256696820259094,
"step": 1335
},
{
"clip_ratio": 0.0,
"completion_length": 736.050927734375,
"epoch": 0.45745497994367157,
"grad_norm": 3.6908681392669678,
"kl": 2.1092041015625,
"learning_rate": 1.9760888409743456e-06,
"loss": 0.2535,
"reward": 2.0162947356700895,
"reward_std": 0.40158444084227085,
"rewards/accuracy_reward": 0.1982142932713032,
"rewards/format_reward": 0.894642898440361,
"rewards/tag_count_reward": 0.9234375447034836,
"step": 1340
},
{
"clip_ratio": 0.0,
"completion_length": 764.6018188476562,
"epoch": 0.4591619015106256,
"grad_norm": 5.394177436828613,
"kl": 1.6607421875,
"learning_rate": 1.96760408900213e-06,
"loss": 0.2132,
"reward": 2.040178656578064,
"reward_std": 0.4042182721197605,
"rewards/accuracy_reward": 0.19285715064033865,
"rewards/format_reward": 0.9107143253087997,
"rewards/tag_count_reward": 0.9366071790456771,
"step": 1345
},
{
"clip_ratio": 0.0,
"completion_length": 786.1000396728516,
"epoch": 0.4608688230775796,
"grad_norm": 4.516533374786377,
"kl": 2.57392578125,
"learning_rate": 1.9591027325239968e-06,
"loss": 0.2251,
"reward": 1.9424108147621155,
"reward_std": 0.5002983555197715,
"rewards/accuracy_reward": 0.16517857778817416,
"rewards/format_reward": 0.86607146859169,
"rewards/tag_count_reward": 0.9111607521772385,
"step": 1350
},
{
"clip_ratio": 0.0,
"completion_length": 763.5866394042969,
"epoch": 0.4625757446445336,
"grad_norm": 2.373253107070923,
"kl": 2.108984375,
"learning_rate": 1.950585073421018e-06,
"loss": 0.2665,
"reward": 1.9013393759727477,
"reward_std": 0.4912081308662891,
"rewards/accuracy_reward": 0.11517857778817416,
"rewards/format_reward": 0.875892898440361,
"rewards/tag_count_reward": 0.910267898440361,
"step": 1355
},
{
"clip_ratio": 0.0,
"completion_length": 803.5339721679687,
"epoch": 0.46428266621148756,
"grad_norm": 1.557633399963379,
"kl": 1.86875,
"learning_rate": 1.942051414153169e-06,
"loss": 0.2379,
"reward": 1.9354911744594574,
"reward_std": 0.47398936599493025,
"rewards/accuracy_reward": 0.13928572162985803,
"rewards/format_reward": 0.8794643312692643,
"rewards/tag_count_reward": 0.9167411029338837,
"step": 1360
},
{
"clip_ratio": 0.0,
"completion_length": 810.9598541259766,
"epoch": 0.46598958777844157,
"grad_norm": 1.5708023309707642,
"kl": 2.6498046875,
"learning_rate": 1.933502057748587e-06,
"loss": 0.2696,
"reward": 1.9158483147621155,
"reward_std": 0.5079516015946866,
"rewards/accuracy_reward": 0.1535714370198548,
"rewards/format_reward": 0.8607143282890319,
"rewards/tag_count_reward": 0.9015625387430191,
"step": 1365
},
{
"clip_ratio": 0.0,
"completion_length": 849.8884338378906,
"epoch": 0.4676965093453956,
"grad_norm": 10.703516006469727,
"kl": 2.755859375,
"learning_rate": 1.9249373077928083e-06,
"loss": 0.3247,
"reward": 1.8904018700122833,
"reward_std": 0.5384089753031731,
"rewards/accuracy_reward": 0.1571428648196161,
"rewards/format_reward": 0.8473214685916901,
"rewards/tag_count_reward": 0.8859375476837158,
"step": 1370
},
{
"clip_ratio": 0.0,
"completion_length": 816.4321716308593,
"epoch": 0.4694034309123496,
"grad_norm": 4.501167297363281,
"kl": 2.9134765625,
"learning_rate": 1.916357468417994e-06,
"loss": 0.3764,
"reward": 1.8883929431438446,
"reward_std": 0.554328379034996,
"rewards/accuracy_reward": 0.16339286603033543,
"rewards/format_reward": 0.8419643253087997,
"rewards/tag_count_reward": 0.883035758137703,
"step": 1375
},
{
"clip_ratio": 0.0,
"completion_length": 816.8000396728515,
"epoch": 0.47111035247930355,
"grad_norm": 4.633331298828125,
"kl": 2.7892578125,
"learning_rate": 1.9077628442921244e-06,
"loss": 0.3534,
"reward": 1.9020090043544768,
"reward_std": 0.5272788584232331,
"rewards/accuracy_reward": 0.15625000484287738,
"rewards/format_reward": 0.8517857521772385,
"rewards/tag_count_reward": 0.8939732611179352,
"step": 1380
},
{
"clip_ratio": 0.0,
"completion_length": 771.9250305175781,
"epoch": 0.47281727404625756,
"grad_norm": 10.220499992370605,
"kl": 3.145703125,
"learning_rate": 1.8991537406081833e-06,
"loss": 0.3939,
"reward": 1.8629464983940125,
"reward_std": 0.49162818491458893,
"rewards/accuracy_reward": 0.14107143618166446,
"rewards/format_reward": 0.8366071820259094,
"rewards/tag_count_reward": 0.8852679044008255,
"step": 1385
},
{
"clip_ratio": 0.0,
"completion_length": 794.0920043945313,
"epoch": 0.4745241956132116,
"grad_norm": 5.842833518981934,
"kl": 2.51865234375,
"learning_rate": 1.8905304630733202e-06,
"loss": 0.3048,
"reward": 1.9725447416305542,
"reward_std": 0.5114175193011761,
"rewards/accuracy_reward": 0.23035715110599994,
"rewards/format_reward": 0.854464328289032,
"rewards/tag_count_reward": 0.8877232521772385,
"step": 1390
},
{
"clip_ratio": 0.0,
"completion_length": 776.0598571777343,
"epoch": 0.4762311171801656,
"grad_norm": 6.666676998138428,
"kl": 2.329638671875,
"learning_rate": 1.881893317897994e-06,
"loss": 0.28,
"reward": 1.9508929431438446,
"reward_std": 0.4666281685233116,
"rewards/accuracy_reward": 0.17053572190925478,
"rewards/format_reward": 0.873214328289032,
"rewards/tag_count_reward": 0.907142898440361,
"step": 1395
},
{
"clip_ratio": 0.0,
"completion_length": 845.3866455078125,
"epoch": 0.47793803874711954,
"grad_norm": 4.843286991119385,
"kl": 2.96796875,
"learning_rate": 1.8732426117851007e-06,
"loss": 0.3336,
"reward": 1.872991156578064,
"reward_std": 0.531745757162571,
"rewards/accuracy_reward": 0.15714286407455802,
"rewards/format_reward": 0.835714328289032,
"rewards/tag_count_reward": 0.8801339656114578,
"step": 1400
},
{
"clip_ratio": 0.0,
"completion_length": 811.6437866210938,
"epoch": 0.47964496031407355,
"grad_norm": 5.695433139801025,
"kl": 2.312890625,
"learning_rate": 1.8645786519190823e-06,
"loss": 0.2987,
"reward": 1.8801340222358705,
"reward_std": 0.5154039770364761,
"rewards/accuracy_reward": 0.13571429047733546,
"rewards/format_reward": 0.845535758137703,
"rewards/tag_count_reward": 0.8988839715719223,
"step": 1405
},
{
"clip_ratio": 0.0,
"completion_length": 846.7286102294922,
"epoch": 0.48135188188102757,
"grad_norm": 6.439587593078613,
"kl": 2.5794921875,
"learning_rate": 1.8559017459550167e-06,
"loss": 0.3875,
"reward": 1.8656250655651092,
"reward_std": 0.5760970249772072,
"rewards/accuracy_reward": 0.17053572209551932,
"rewards/format_reward": 0.8205357521772385,
"rewards/tag_count_reward": 0.8745536088943482,
"step": 1410
},
{
"clip_ratio": 0.0,
"completion_length": 954.6134429931641,
"epoch": 0.4830588034479816,
"grad_norm": 6.831388473510742,
"kl": 3.415234375,
"learning_rate": 1.8472122020076958e-06,
"loss": 0.4372,
"reward": 1.7779018580913544,
"reward_std": 0.6864479184150696,
"rewards/accuracy_reward": 0.15178572349250316,
"rewards/format_reward": 0.7741071820259094,
"rewards/tag_count_reward": 0.8520089715719223,
"step": 1415
},
{
"clip_ratio": 0.0,
"completion_length": 928.3268310546875,
"epoch": 0.4847657250149356,
"grad_norm": 4.245449542999268,
"kl": 3.4875,
"learning_rate": 1.8385103286406828e-06,
"loss": 0.5199,
"reward": 1.7602679431438446,
"reward_std": 0.658353678882122,
"rewards/accuracy_reward": 0.17142857844009995,
"rewards/format_reward": 0.7517857402563095,
"rewards/tag_count_reward": 0.8370536118745804,
"step": 1420
},
{
"clip_ratio": 0.0,
"completion_length": 945.8277252197265,
"epoch": 0.48647264658188955,
"grad_norm": 4.0879902839660645,
"kl": 3.248046875,
"learning_rate": 1.8297964348553555e-06,
"loss": 0.4751,
"reward": 1.7975447297096252,
"reward_std": 0.651876625418663,
"rewards/accuracy_reward": 0.16517857937142252,
"rewards/format_reward": 0.7839286118745804,
"rewards/tag_count_reward": 0.8484375387430191,
"step": 1425
},
{
"clip_ratio": 0.0,
"completion_length": 878.1509368896484,
"epoch": 0.48817956814884356,
"grad_norm": 4.380373001098633,
"kl": 2.2369140625,
"learning_rate": 1.821070830079935e-06,
"loss": 0.3507,
"reward": 1.8169643521308898,
"reward_std": 0.5784257367253304,
"rewards/accuracy_reward": 0.14553571958094835,
"rewards/format_reward": 0.810714328289032,
"rewards/tag_count_reward": 0.8607143253087998,
"step": 1430
},
{
"clip_ratio": 0.0,
"completion_length": 820.9803894042968,
"epoch": 0.48988648971579757,
"grad_norm": 2.2317075729370117,
"kl": 1.8021484375,
"learning_rate": 1.812333824158494e-06,
"loss": 0.2936,
"reward": 1.8819197297096253,
"reward_std": 0.5624213635921478,
"rewards/accuracy_reward": 0.1625000074505806,
"rewards/format_reward": 0.8375000447034836,
"rewards/tag_count_reward": 0.8819196820259094,
"step": 1435
},
{
"clip_ratio": 0.0,
"completion_length": 823.3062927246094,
"epoch": 0.4915934112827516,
"grad_norm": 2.928972005844116,
"kl": 2.2279296875,
"learning_rate": 1.80358572733996e-06,
"loss": 0.2098,
"reward": 1.8060268759727478,
"reward_std": 0.5900841251015663,
"rewards/accuracy_reward": 0.1669642923399806,
"rewards/format_reward": 0.7946428924798965,
"rewards/tag_count_reward": 0.8444196790456772,
"step": 1440
},
{
"clip_ratio": 0.0,
"completion_length": 766.2384307861328,
"epoch": 0.49330033284970554,
"grad_norm": 2.458934783935547,
"kl": 1.76279296875,
"learning_rate": 1.7948268502670936e-06,
"loss": 0.2435,
"reward": 1.9732143819332122,
"reward_std": 0.4333352468907833,
"rewards/accuracy_reward": 0.17767857909202575,
"rewards/format_reward": 0.8839286148548127,
"rewards/tag_count_reward": 0.9116071820259094,
"step": 1445
},
{
"clip_ratio": 0.0,
"completion_length": 753.6009185791015,
"epoch": 0.49500725441665955,
"grad_norm": 2.1454453468322754,
"kl": 1.71591796875,
"learning_rate": 1.7860575039654605e-06,
"loss": 0.2184,
"reward": 1.9578125774860382,
"reward_std": 0.4223016068339348,
"rewards/accuracy_reward": 0.16517857760190963,
"rewards/format_reward": 0.8830357551574707,
"rewards/tag_count_reward": 0.9095982551574707,
"step": 1450
},
{
"clip_ratio": 0.0,
"completion_length": 834.9000396728516,
"epoch": 0.49671417598361356,
"grad_norm": 4.523981094360352,
"kl": 2.29609375,
"learning_rate": 1.7772779998323859e-06,
"loss": 0.3067,
"reward": 1.8763393580913543,
"reward_std": 0.5456696435809135,
"rewards/accuracy_reward": 0.172321436367929,
"rewards/format_reward": 0.8348214656114579,
"rewards/tag_count_reward": 0.8691964626312256,
"step": 1455
},
{
"clip_ratio": 0.0,
"completion_length": 839.538427734375,
"epoch": 0.49842109755056757,
"grad_norm": 3.2237324714660645,
"kl": 2.2123046875,
"learning_rate": 1.768488649625897e-06,
"loss": 0.3073,
"reward": 1.825446492433548,
"reward_std": 0.5417851440608501,
"rewards/accuracy_reward": 0.11696429047733545,
"rewards/format_reward": 0.8312500387430191,
"rewards/tag_count_reward": 0.8772321790456772,
"step": 1460
},
{
"clip_ratio": 0.0,
"completion_length": 828.7875366210938,
"epoch": 0.5001280191175216,
"grad_norm": 6.970083713531494,
"kl": 2.060546875,
"learning_rate": 1.7596897654536527e-06,
"loss": 0.2593,
"reward": 1.8203125894069672,
"reward_std": 0.5907119750976563,
"rewards/accuracy_reward": 0.14107143683359027,
"rewards/format_reward": 0.8169643253087997,
"rewards/tag_count_reward": 0.862276816368103,
"step": 1465
},
{
"clip_ratio": 0.0,
"completion_length": 805.7134307861328,
"epoch": 0.5018349406844755,
"grad_norm": 15.752748489379883,
"kl": 2.48359375,
"learning_rate": 1.7508816597618611e-06,
"loss": 0.3889,
"reward": 1.835714375972748,
"reward_std": 0.5934141919016838,
"rewards/accuracy_reward": 0.147321436740458,
"rewards/format_reward": 0.8223214685916901,
"rewards/tag_count_reward": 0.8660714715719223,
"step": 1470
},
{
"clip_ratio": 0.0,
"completion_length": 995.4866485595703,
"epoch": 0.5035418622514295,
"grad_norm": 10.973194122314453,
"kl": 4.366015625,
"learning_rate": 1.742064645324183e-06,
"loss": 0.5055,
"reward": 1.472098284959793,
"reward_std": 0.7849221974611282,
"rewards/accuracy_reward": 0.16071429327130318,
"rewards/format_reward": 0.5678571671247482,
"rewards/tag_count_reward": 0.743526816368103,
"step": 1475
},
{
"clip_ratio": 0.0,
"completion_length": 1102.144708251953,
"epoch": 0.5052487838183836,
"grad_norm": 6.444234371185303,
"kl": 4.0703125,
"learning_rate": 1.7332390352306282e-06,
"loss": 0.3969,
"reward": 0.9915179014205933,
"reward_std": 0.6856313347816467,
"rewards/accuracy_reward": 0.1000000050291419,
"rewards/format_reward": 0.28392858281731603,
"rewards/tag_count_reward": 0.607589316368103,
"step": 1480
},
{
"clip_ratio": 0.0,
"completion_length": 862.6955749511719,
"epoch": 0.5069557053853375,
"grad_norm": 3.410790205001831,
"kl": 1.875390625,
"learning_rate": 1.7244051428764343e-06,
"loss": 0.2012,
"reward": 0.9638393312692642,
"reward_std": 0.6327465578913689,
"rewards/accuracy_reward": 0.14732143711298704,
"rewards/format_reward": 0.21071429494768382,
"rewards/tag_count_reward": 0.6058035999536514,
"step": 1485
},
{
"clip_ratio": 0.0,
"completion_length": 769.9509246826171,
"epoch": 0.5086626269522916,
"grad_norm": 1.9697375297546387,
"kl": 1.87890625,
"learning_rate": 1.7155632819509417e-06,
"loss": 0.2169,
"reward": 0.9707589715719223,
"reward_std": 0.6286859422922134,
"rewards/accuracy_reward": 0.13571429383009673,
"rewards/format_reward": 0.22767858132719992,
"rewards/tag_count_reward": 0.607366093993187,
"step": 1490
},
{
"clip_ratio": 0.0,
"completion_length": 766.0982421875,
"epoch": 0.5103695485192455,
"grad_norm": 10.65464973449707,
"kl": 2.2140625,
"learning_rate": 1.7067137664264521e-06,
"loss": 0.2392,
"reward": 1.3678571969270705,
"reward_std": 0.768636429309845,
"rewards/accuracy_reward": 0.13571429224684833,
"rewards/format_reward": 0.4928571656346321,
"rewards/tag_count_reward": 0.7392857491970062,
"step": 1495
},
{
"clip_ratio": 0.0,
"completion_length": 736.3580718994141,
"epoch": 0.5120764700861995,
"grad_norm": 2.2699103355407715,
"kl": 1.77646484375,
"learning_rate": 1.6978569105470792e-06,
"loss": 0.1815,
"reward": 1.8080357968807221,
"reward_std": 0.6647521004080772,
"rewards/accuracy_reward": 0.19642858039587735,
"rewards/format_reward": 0.7562500357627868,
"rewards/tag_count_reward": 0.8553571820259094,
"step": 1500
},
{
"clip_ratio": 0.0,
"completion_length": 779.2143188476563,
"epoch": 0.5137833916531536,
"grad_norm": 6.321624279022217,
"kl": 2.9748046875,
"learning_rate": 1.6889930288175922e-06,
"loss": 0.3307,
"reward": 1.8591518819332122,
"reward_std": 0.6267977371811867,
"rewards/accuracy_reward": 0.18214286677539349,
"rewards/format_reward": 0.8017857581377029,
"rewards/tag_count_reward": 0.8752232521772385,
"step": 1505
},
{
"clip_ratio": 0.0,
"completion_length": 795.1321807861328,
"epoch": 0.5154903132201075,
"grad_norm": 1.8599531650543213,
"kl": 3.016796875,
"learning_rate": 1.6801224359922466e-06,
"loss": 0.3243,
"reward": 1.79151793718338,
"reward_std": 0.6220594555139541,
"rewards/accuracy_reward": 0.14910714784637094,
"rewards/format_reward": 0.7839286088943481,
"rewards/tag_count_reward": 0.8584821820259094,
"step": 1510
},
{
"clip_ratio": 0.0,
"completion_length": 790.3187866210938,
"epoch": 0.5171972347870616,
"grad_norm": 4.756523132324219,
"kl": 1.9025390625,
"learning_rate": 1.6712454470636052e-06,
"loss": 0.2568,
"reward": 1.8700893700122834,
"reward_std": 0.558867233991623,
"rewards/accuracy_reward": 0.19375000949949026,
"rewards/format_reward": 0.8107143223285675,
"rewards/tag_count_reward": 0.8656250447034836,
"step": 1515
},
{
"clip_ratio": 0.0,
"completion_length": 774.834848022461,
"epoch": 0.5189041563540155,
"grad_norm": 1.1406761407852173,
"kl": 1.52021484375,
"learning_rate": 1.6623623772513576e-06,
"loss": 0.2219,
"reward": 1.9486608028411865,
"reward_std": 0.46728189289569855,
"rewards/accuracy_reward": 0.17767857825383543,
"rewards/format_reward": 0.8642857551574707,
"rewards/tag_count_reward": 0.9066964685916901,
"step": 1520
},
{
"clip_ratio": 0.0,
"completion_length": 765.0571685791016,
"epoch": 0.5206110779209695,
"grad_norm": 1.1958781480789185,
"kl": 1.4518798828125,
"learning_rate": 1.6534735419911228e-06,
"loss": 0.2041,
"reward": 1.9100447177886963,
"reward_std": 0.4585052601993084,
"rewards/accuracy_reward": 0.15178572274744512,
"rewards/format_reward": 0.8625000387430191,
"rewards/tag_count_reward": 0.89575896859169,
"step": 1525
},
{
"clip_ratio": 0.0,
"completion_length": 802.6937805175781,
"epoch": 0.5223179994879236,
"grad_norm": 4.186196327209473,
"kl": 1.6386474609375,
"learning_rate": 1.6445792569232486e-06,
"loss": 0.1724,
"reward": 1.8738840222358704,
"reward_std": 0.5293861232697964,
"rewards/accuracy_reward": 0.15535714933648706,
"rewards/format_reward": 0.8401786059141159,
"rewards/tag_count_reward": 0.8783482551574707,
"step": 1530
},
{
"clip_ratio": 0.0,
"completion_length": 802.0018249511719,
"epoch": 0.5240249210548775,
"grad_norm": 4.158358573913574,
"kl": 0.99736328125,
"learning_rate": 1.635679837881606e-06,
"loss": 0.1695,
"reward": 1.9618304431438447,
"reward_std": 0.4689994312822819,
"rewards/accuracy_reward": 0.17321429261937737,
"rewards/format_reward": 0.8794643312692643,
"rewards/tag_count_reward": 0.9091518253087998,
"step": 1535
},
{
"clip_ratio": 0.0,
"completion_length": 778.8018188476562,
"epoch": 0.5257318426218315,
"grad_norm": 1.9054951667785645,
"kl": 0.9982421875,
"learning_rate": 1.6267756008823701e-06,
"loss": 0.1331,
"reward": 1.9642858028411865,
"reward_std": 0.4854655273258686,
"rewards/accuracy_reward": 0.21785715147852897,
"rewards/format_reward": 0.8535714656114578,
"rewards/tag_count_reward": 0.8928571850061416,
"step": 1540
},
{
"clip_ratio": 0.0,
"completion_length": 790.87236328125,
"epoch": 0.5274387641887855,
"grad_norm": 3.2148427963256836,
"kl": 0.77548828125,
"learning_rate": 1.6178668621128018e-06,
"loss": 0.1566,
"reward": 1.8600447177886963,
"reward_std": 0.534805352985859,
"rewards/accuracy_reward": 0.1491071498952806,
"rewards/format_reward": 0.8348214715719223,
"rewards/tag_count_reward": 0.8761161118745804,
"step": 1545
},
{
"clip_ratio": 0.0,
"completion_length": 772.5991455078125,
"epoch": 0.5291456857557395,
"grad_norm": 2.275534152984619,
"kl": 0.71689453125,
"learning_rate": 1.6089539379200189e-06,
"loss": 0.1425,
"reward": 1.9479911565780639,
"reward_std": 0.4742655538022518,
"rewards/accuracy_reward": 0.16696429336443544,
"rewards/format_reward": 0.8714286148548126,
"rewards/tag_count_reward": 0.9095982581377029,
"step": 1550
},
{
"clip_ratio": 0.0,
"completion_length": 743.4250305175781,
"epoch": 0.5308526073226936,
"grad_norm": 1.629341959953308,
"kl": 0.627587890625,
"learning_rate": 1.6000371447997617e-06,
"loss": 0.1077,
"reward": 2.0165179669857025,
"reward_std": 0.4406208969652653,
"rewards/accuracy_reward": 0.20625001080334188,
"rewards/format_reward": 0.8910714685916901,
"rewards/tag_count_reward": 0.91919646859169,
"step": 1555
},
{
"clip_ratio": 0.0,
"completion_length": 693.431283569336,
"epoch": 0.5325595288896475,
"grad_norm": 0.5048644542694092,
"kl": 0.6044921875,
"learning_rate": 1.591116799385156e-06,
"loss": 0.0448,
"reward": 2.0198661506175997,
"reward_std": 0.44208099469542506,
"rewards/accuracy_reward": 0.23660715185105802,
"rewards/format_reward": 0.8732143312692642,
"rewards/tag_count_reward": 0.910044687986374,
"step": 1560
},
{
"clip_ratio": 0.0,
"completion_length": 712.6607482910156,
"epoch": 0.5342664504566015,
"grad_norm": 0.7713425755500793,
"kl": 0.63125,
"learning_rate": 1.5821932184354677e-06,
"loss": 0.0739,
"reward": 2.0334822297096253,
"reward_std": 0.4626117169857025,
"rewards/accuracy_reward": 0.2223214386962354,
"rewards/format_reward": 0.8910714745521545,
"rewards/tag_count_reward": 0.9200893342494965,
"step": 1565
},
{
"clip_ratio": 0.0,
"completion_length": 731.1116363525391,
"epoch": 0.5359733720235555,
"grad_norm": 1.309543251991272,
"kl": 0.68330078125,
"learning_rate": 1.5732667188248568e-06,
"loss": 0.0613,
"reward": 1.997991144657135,
"reward_std": 0.4713860541582108,
"rewards/accuracy_reward": 0.1919642967171967,
"rewards/format_reward": 0.886607187986374,
"rewards/tag_count_reward": 0.9194196850061417,
"step": 1570
},
{
"clip_ratio": 0.0,
"completion_length": 718.6598480224609,
"epoch": 0.5376802935905095,
"grad_norm": 0.7583140730857849,
"kl": 0.94140625,
"learning_rate": 1.5643376175311233e-06,
"loss": 0.0464,
"reward": 2.0303572535514833,
"reward_std": 0.46192915737628937,
"rewards/accuracy_reward": 0.23392858356237411,
"rewards/format_reward": 0.8848214775323868,
"rewards/tag_count_reward": 0.9116071879863739,
"step": 1575
},
{
"clip_ratio": 0.0,
"completion_length": 776.775927734375,
"epoch": 0.5393872151574635,
"grad_norm": 2.4456801414489746,
"kl": 0.96123046875,
"learning_rate": 1.555406231624453e-06,
"loss": 0.0798,
"reward": 1.9665179789066314,
"reward_std": 0.4243281804025173,
"rewards/accuracy_reward": 0.16785715064033865,
"rewards/format_reward": 0.8848214656114578,
"rewards/tag_count_reward": 0.913839328289032,
"step": 1580
},
{
"clip_ratio": 0.0,
"completion_length": 745.4625274658204,
"epoch": 0.5410941367244175,
"grad_norm": 2.722489595413208,
"kl": 1.0029296875,
"learning_rate": 1.5464728782561578e-06,
"loss": 0.0884,
"reward": 1.9870536386966706,
"reward_std": 0.3701733611524105,
"rewards/accuracy_reward": 0.17500000689178705,
"rewards/format_reward": 0.8928571879863739,
"rewards/tag_count_reward": 0.91919646859169,
"step": 1585
},
{
"clip_ratio": 0.0,
"completion_length": 767.0625396728516,
"epoch": 0.5428010582913715,
"grad_norm": 9.577692031860352,
"kl": 1.68056640625,
"learning_rate": 1.537537874647413e-06,
"loss": 0.1848,
"reward": 2.045535796880722,
"reward_std": 0.3802845995873213,
"rewards/accuracy_reward": 0.21428572600707413,
"rewards/format_reward": 0.9026786178350449,
"rewards/tag_count_reward": 0.92857146859169,
"step": 1590
},
{
"clip_ratio": 0.0,
"completion_length": 841.7741516113281,
"epoch": 0.5445079798583256,
"grad_norm": 5.46830415725708,
"kl": 2.172265625,
"learning_rate": 1.5286015380779939e-06,
"loss": 0.2761,
"reward": 2.0111608028411867,
"reward_std": 0.4875886231660843,
"rewards/accuracy_reward": 0.21785715352743865,
"rewards/format_reward": 0.8812500417232514,
"rewards/tag_count_reward": 0.9120536148548126,
"step": 1595
},
{
"clip_ratio": 0.0,
"completion_length": 802.6259307861328,
"epoch": 0.5462149014252795,
"grad_norm": 56.66122817993164,
"kl": 2.2943359375,
"learning_rate": 1.5196641858750092e-06,
"loss": 0.3174,
"reward": 1.9904018878936767,
"reward_std": 0.43015862852334974,
"rewards/accuracy_reward": 0.18750000754371285,
"rewards/format_reward": 0.8866071850061417,
"rewards/tag_count_reward": 0.9162946820259095,
"step": 1600
},
{
"clip_ratio": 0.0,
"completion_length": 812.5286102294922,
"epoch": 0.5479218229922335,
"grad_norm": 10.48519229888916,
"kl": 2.0384765625,
"learning_rate": 1.5107261354016317e-06,
"loss": 0.2719,
"reward": 2.0328125834465025,
"reward_std": 0.4505886062979698,
"rewards/accuracy_reward": 0.22857143878936767,
"rewards/format_reward": 0.8883928984403611,
"rewards/tag_count_reward": 0.9158482551574707,
"step": 1605
},
{
"clip_ratio": 0.0,
"completion_length": 788.7830780029296,
"epoch": 0.5496287445591875,
"grad_norm": 1.7132465839385986,
"kl": 2.090625,
"learning_rate": 1.5017877040458307e-06,
"loss": 0.3233,
"reward": 1.9564733147621154,
"reward_std": 0.4755387619137764,
"rewards/accuracy_reward": 0.18482143748551608,
"rewards/format_reward": 0.866964328289032,
"rewards/tag_count_reward": 0.9046875417232514,
"step": 1610
},
{
"clip_ratio": 0.0,
"completion_length": 811.6839599609375,
"epoch": 0.5513356661261415,
"grad_norm": 6.485858917236328,
"kl": 2.4689453125,
"learning_rate": 1.4928492092091e-06,
"loss": 0.3277,
"reward": 1.9069197118282317,
"reward_std": 0.5133912198245525,
"rewards/accuracy_reward": 0.1741071511991322,
"rewards/format_reward": 0.846428605914116,
"rewards/tag_count_reward": 0.8863839745521546,
"step": 1615
},
{
"clip_ratio": 0.0,
"completion_length": 810.3795043945313,
"epoch": 0.5530425876930956,
"grad_norm": 3.7540383338928223,
"kl": 1.98447265625,
"learning_rate": 1.4839109682951868e-06,
"loss": 0.248,
"reward": 1.9069197177886963,
"reward_std": 0.5398491598665714,
"rewards/accuracy_reward": 0.177678579185158,
"rewards/format_reward": 0.8455357551574707,
"rewards/tag_count_reward": 0.8837054014205933,
"step": 1620
},
{
"clip_ratio": 0.0,
"completion_length": 801.8080688476563,
"epoch": 0.5547495092600495,
"grad_norm": 7.626502513885498,
"kl": 2.75556640625,
"learning_rate": 1.4749732986988233e-06,
"loss": 0.2917,
"reward": 1.9462054371833801,
"reward_std": 0.49239194244146345,
"rewards/accuracy_reward": 0.1937500095926225,
"rewards/format_reward": 0.8562500357627869,
"rewards/tag_count_reward": 0.8962053984403611,
"step": 1625
},
{
"clip_ratio": 0.0,
"completion_length": 810.5491424560547,
"epoch": 0.5564564308270035,
"grad_norm": 11.224091529846191,
"kl": 2.851171875,
"learning_rate": 1.4660365177944528e-06,
"loss": 0.3272,
"reward": 1.950892949104309,
"reward_std": 0.5697008088231087,
"rewards/accuracy_reward": 0.21250000949949027,
"rewards/format_reward": 0.8482143223285675,
"rewards/tag_count_reward": 0.8901786178350448,
"step": 1630
},
{
"clip_ratio": 0.0,
"completion_length": 755.3411041259766,
"epoch": 0.5581633523939575,
"grad_norm": 5.468372344970703,
"kl": 2.611328125,
"learning_rate": 1.4571009429249621e-06,
"loss": 0.2821,
"reward": 1.9037947297096252,
"reward_std": 0.4984534740447998,
"rewards/accuracy_reward": 0.1607142923399806,
"rewards/format_reward": 0.8517857551574707,
"rewards/tag_count_reward": 0.8912946790456772,
"step": 1635
},
{
"clip_ratio": 0.0,
"completion_length": 756.8464630126953,
"epoch": 0.5598702739609115,
"grad_norm": 6.450639724731445,
"kl": 1.516015625,
"learning_rate": 1.448166891390412e-06,
"loss": 0.2243,
"reward": 1.97678582072258,
"reward_std": 0.3970135450363159,
"rewards/accuracy_reward": 0.17410715036094188,
"rewards/format_reward": 0.8866071790456772,
"rewards/tag_count_reward": 0.916071480512619,
"step": 1640
},
{
"clip_ratio": 0.0,
"completion_length": 774.0580657958984,
"epoch": 0.5615771955278654,
"grad_norm": 8.960746765136719,
"kl": 2.1162353515625,
"learning_rate": 1.4392346804367697e-06,
"loss": 0.2795,
"reward": 2.002232217788696,
"reward_std": 0.5248230457305908,
"rewards/accuracy_reward": 0.2196428682655096,
"rewards/format_reward": 0.8714286148548126,
"rewards/tag_count_reward": 0.9111607581377029,
"step": 1645
},
{
"clip_ratio": 0.0,
"completion_length": 785.7687835693359,
"epoch": 0.5632841170948195,
"grad_norm": 3.409419536590576,
"kl": 2.0033203125,
"learning_rate": 1.4303046272446437e-06,
"loss": 0.2474,
"reward": 2.006473296880722,
"reward_std": 0.477356181293726,
"rewards/accuracy_reward": 0.2330357264727354,
"rewards/format_reward": 0.8714286118745804,
"rewards/tag_count_reward": 0.9020089685916901,
"step": 1650
},
{
"clip_ratio": 0.0,
"completion_length": 799.9205718994141,
"epoch": 0.5649910386617735,
"grad_norm": 4.918757438659668,
"kl": 2.9984375,
"learning_rate": 1.4213770489180224e-06,
"loss": 0.3257,
"reward": 1.9767858028411864,
"reward_std": 0.46925376951694486,
"rewards/accuracy_reward": 0.20892858104780315,
"rewards/format_reward": 0.8669643223285675,
"rewards/tag_count_reward": 0.9008928924798966,
"step": 1655
},
{
"clip_ratio": 0.0,
"completion_length": 786.0205749511719,
"epoch": 0.5666979602287275,
"grad_norm": 7.537283897399902,
"kl": 2.5080078125,
"learning_rate": 1.4124522624730095e-06,
"loss": 0.2612,
"reward": 1.9294643819332122,
"reward_std": 0.4918681502342224,
"rewards/accuracy_reward": 0.19285715091973543,
"rewards/format_reward": 0.8500000417232514,
"rewards/tag_count_reward": 0.8866071850061417,
"step": 1660
},
{
"clip_ratio": 0.0,
"completion_length": 761.3580657958985,
"epoch": 0.5684048817956815,
"grad_norm": 5.60185432434082,
"kl": 2.038671875,
"learning_rate": 1.403530584826573e-06,
"loss": 0.2786,
"reward": 1.985044741630554,
"reward_std": 0.45016813948750495,
"rewards/accuracy_reward": 0.19553572395816446,
"rewards/format_reward": 0.8785714656114578,
"rewards/tag_count_reward": 0.9109375476837158,
"step": 1665
},
{
"clip_ratio": 0.0,
"completion_length": 766.7759307861328,
"epoch": 0.5701118033626354,
"grad_norm": 14.261216163635254,
"kl": 2.019921875,
"learning_rate": 1.3946123327852855e-06,
"loss": 0.2267,
"reward": 2.0183036804199217,
"reward_std": 0.4540314465761185,
"rewards/accuracy_reward": 0.2089285825379193,
"rewards/format_reward": 0.8883929014205932,
"rewards/tag_count_reward": 0.9209821850061417,
"step": 1670
},
{
"clip_ratio": 0.0,
"completion_length": 788.6384246826171,
"epoch": 0.5718187249295895,
"grad_norm": 5.337741851806641,
"kl": 2.634375,
"learning_rate": 1.3856978230340789e-06,
"loss": 0.2992,
"reward": 2.0158482909202577,
"reward_std": 0.5203843086957931,
"rewards/accuracy_reward": 0.21875001154839993,
"rewards/format_reward": 0.8812500476837158,
"rewards/tag_count_reward": 0.9158482611179352,
"step": 1675
},
{
"clip_ratio": 0.0,
"completion_length": 763.0973571777344,
"epoch": 0.5735256464965435,
"grad_norm": 3.45206618309021,
"kl": 2.1083984375,
"learning_rate": 1.3767873721249963e-06,
"loss": 0.2189,
"reward": 1.9062501072883606,
"reward_std": 0.500207568705082,
"rewards/accuracy_reward": 0.17053572209551932,
"rewards/format_reward": 0.8473214715719223,
"rewards/tag_count_reward": 0.8883928954601288,
"step": 1680
},
{
"clip_ratio": 0.0,
"completion_length": 764.3705688476563,
"epoch": 0.5752325680634974,
"grad_norm": 4.182073593139648,
"kl": 1.9408203125,
"learning_rate": 1.3678812964659528e-06,
"loss": 0.1796,
"reward": 1.9564732909202576,
"reward_std": 0.5043147563934326,
"rewards/accuracy_reward": 0.21696429643779994,
"rewards/format_reward": 0.8491071879863739,
"rewards/tag_count_reward": 0.890401828289032,
"step": 1685
},
{
"clip_ratio": 0.0,
"completion_length": 745.931283569336,
"epoch": 0.5769394896304515,
"grad_norm": 3.476262092590332,
"kl": 2.228125,
"learning_rate": 1.358979912309499e-06,
"loss": 0.1999,
"reward": 1.9243304431438446,
"reward_std": 0.4896064311265945,
"rewards/accuracy_reward": 0.14642857890576125,
"rewards/format_reward": 0.8696429014205933,
"rewards/tag_count_reward": 0.9082589715719223,
"step": 1690
},
{
"clip_ratio": 0.0,
"completion_length": 736.755387878418,
"epoch": 0.5786464111974055,
"grad_norm": 3.4234468936920166,
"kl": 1.89208984375,
"learning_rate": 1.3500835357415933e-06,
"loss": 0.1948,
"reward": 1.987053632736206,
"reward_std": 0.48905017524957656,
"rewards/accuracy_reward": 0.20267857918515803,
"rewards/format_reward": 0.8732143342494965,
"rewards/tag_count_reward": 0.9111607521772385,
"step": 1695
},
{
"clip_ratio": 0.0,
"completion_length": 719.3107406616211,
"epoch": 0.5803533327643595,
"grad_norm": 2.9903573989868164,
"kl": 2.109375,
"learning_rate": 1.341192482670372e-06,
"loss": 0.1728,
"reward": 1.975446528196335,
"reward_std": 0.46964697986841203,
"rewards/accuracy_reward": 0.20625000912696123,
"rewards/format_reward": 0.8678571820259094,
"rewards/tag_count_reward": 0.9013393312692642,
"step": 1700
},
{
"clip_ratio": 0.0,
"completion_length": 752.3080657958984,
"epoch": 0.5820602543313135,
"grad_norm": 9.797762870788574,
"kl": 1.139453125,
"learning_rate": 1.3323070688149395e-06,
"loss": 0.1828,
"reward": 2.018750089406967,
"reward_std": 0.45740093365311624,
"rewards/accuracy_reward": 0.22053572498261928,
"rewards/format_reward": 0.8875000387430191,
"rewards/tag_count_reward": 0.9107143312692643,
"step": 1705
},
{
"clip_ratio": 0.0,
"completion_length": 727.2919998168945,
"epoch": 0.5837671758982674,
"grad_norm": 8.485093116760254,
"kl": 1.298193359375,
"learning_rate": 1.3234276096941503e-06,
"loss": 0.1714,
"reward": 2.0808036625385284,
"reward_std": 0.3648061454296112,
"rewards/accuracy_reward": 0.24821429811418055,
"rewards/format_reward": 0.9026786178350449,
"rewards/tag_count_reward": 0.9299107611179351,
"step": 1710
},
{
"clip_ratio": 0.0,
"completion_length": 705.1152099609375,
"epoch": 0.5854740974652215,
"grad_norm": 4.864824295043945,
"kl": 1.031396484375,
"learning_rate": 1.314554420615409e-06,
"loss": 0.1304,
"reward": 2.053125095367432,
"reward_std": 0.32865179777145387,
"rewards/accuracy_reward": 0.2026785796508193,
"rewards/format_reward": 0.9133928924798965,
"rewards/tag_count_reward": 0.9370536088943482,
"step": 1715
},
{
"clip_ratio": 0.0,
"completion_length": 705.3196685791015,
"epoch": 0.5871810190321755,
"grad_norm": 1.7557798624038696,
"kl": 1.1629150390625,
"learning_rate": 1.3056878166634721e-06,
"loss": 0.1955,
"reward": 2.0312501013278963,
"reward_std": 0.36564070135355,
"rewards/accuracy_reward": 0.17589286509901286,
"rewards/format_reward": 0.9151786088943481,
"rewards/tag_count_reward": 0.9401786118745804,
"step": 1720
},
{
"clip_ratio": 0.0,
"completion_length": 717.8866394042968,
"epoch": 0.5888879405991295,
"grad_norm": 2.3789522647857666,
"kl": 1.434619140625,
"learning_rate": 1.2968281126892603e-06,
"loss": 0.189,
"reward": 2.010937583446503,
"reward_std": 0.32410271763801574,
"rewards/accuracy_reward": 0.15446429206058382,
"rewards/format_reward": 0.916964316368103,
"rewards/tag_count_reward": 0.9395089656114578,
"step": 1725
},
{
"clip_ratio": 0.0,
"completion_length": 764.8107513427734,
"epoch": 0.5905948621660835,
"grad_norm": 6.4479899406433105,
"kl": 1.55869140625,
"learning_rate": 1.2879756232986763e-06,
"loss": 0.2201,
"reward": 2.0031250774860383,
"reward_std": 0.44354828745126723,
"rewards/accuracy_reward": 0.2142857240512967,
"rewards/format_reward": 0.8767857551574707,
"rewards/tag_count_reward": 0.9120536148548126,
"step": 1730
},
{
"clip_ratio": 0.0,
"completion_length": 760.9750305175781,
"epoch": 0.5923017837330374,
"grad_norm": 1.7079081535339355,
"kl": 1.798828125,
"learning_rate": 1.2791306628414377e-06,
"loss": 0.2179,
"reward": 1.9656250894069671,
"reward_std": 0.3878778383135796,
"rewards/accuracy_reward": 0.15892857862636448,
"rewards/format_reward": 0.8883928924798965,
"rewards/tag_count_reward": 0.9183036118745804,
"step": 1735
},
{
"clip_ratio": 0.0,
"completion_length": 713.3893127441406,
"epoch": 0.5940087052999915,
"grad_norm": 1.6621439456939697,
"kl": 1.703564453125,
"learning_rate": 1.2702935453999079e-06,
"loss": 0.1869,
"reward": 2.0366072416305543,
"reward_std": 0.4229817047715187,
"rewards/accuracy_reward": 0.20267858179286122,
"rewards/format_reward": 0.9017857640981675,
"rewards/tag_count_reward": 0.9321429014205933,
"step": 1740
},
{
"clip_ratio": 0.0,
"completion_length": 728.0705627441406,
"epoch": 0.5957156268669455,
"grad_norm": 2.5876617431640625,
"kl": 1.56240234375,
"learning_rate": 1.2614645847779498e-06,
"loss": 0.1968,
"reward": 1.926562601327896,
"reward_std": 0.4078594759106636,
"rewards/accuracy_reward": 0.16071429485455155,
"rewards/format_reward": 0.8642857521772385,
"rewards/tag_count_reward": 0.9015625417232513,
"step": 1745
},
{
"clip_ratio": 0.0,
"completion_length": 736.7232452392578,
"epoch": 0.5974225484338994,
"grad_norm": 5.281065464019775,
"kl": 1.70810546875,
"learning_rate": 1.252644094489778e-06,
"loss": 0.1785,
"reward": 1.942410796880722,
"reward_std": 0.44451272040605544,
"rewards/accuracy_reward": 0.17678572433069348,
"rewards/format_reward": 0.8625000447034836,
"rewards/tag_count_reward": 0.9031250447034835,
"step": 1750
},
{
"clip_ratio": 0.0,
"completion_length": 677.8294891357422,
"epoch": 0.5991294700008535,
"grad_norm": 4.909358501434326,
"kl": 1.2296875,
"learning_rate": 1.2438323877488274e-06,
"loss": 0.1429,
"reward": 2.0571429431438446,
"reward_std": 0.4347854398190975,
"rewards/accuracy_reward": 0.2392857251688838,
"rewards/format_reward": 0.8910714685916901,
"rewards/tag_count_reward": 0.9267857581377029,
"step": 1755
},
{
"clip_ratio": 0.0,
"completion_length": 740.7134246826172,
"epoch": 0.6008363915678074,
"grad_norm": 9.942142486572266,
"kl": 1.40419921875,
"learning_rate": 1.2350297774566337e-06,
"loss": 0.1766,
"reward": 1.97477685213089,
"reward_std": 0.4526204660534859,
"rewards/accuracy_reward": 0.19196429187431932,
"rewards/format_reward": 0.8750000357627868,
"rewards/tag_count_reward": 0.9078125417232513,
"step": 1760
},
{
"clip_ratio": 0.0,
"completion_length": 722.588427734375,
"epoch": 0.6025433131347615,
"grad_norm": 2.3722033500671387,
"kl": 1.3779296875,
"learning_rate": 1.2262365761917163e-06,
"loss": 0.1971,
"reward": 2.035267961025238,
"reward_std": 0.4693600654602051,
"rewards/accuracy_reward": 0.2125000089406967,
"rewards/format_reward": 0.8955357551574707,
"rewards/tag_count_reward": 0.9272321820259094,
"step": 1765
},
{
"clip_ratio": 0.0,
"completion_length": 708.3277038574219,
"epoch": 0.6042502347017155,
"grad_norm": 4.353895664215088,
"kl": 1.6890625,
"learning_rate": 1.2174530961984853e-06,
"loss": 0.2061,
"reward": 1.922321516275406,
"reward_std": 0.49082919061183927,
"rewards/accuracy_reward": 0.1955357225611806,
"rewards/format_reward": 0.8410714715719223,
"rewards/tag_count_reward": 0.885714328289032,
"step": 1770
},
{
"clip_ratio": 0.0,
"completion_length": 765.6419982910156,
"epoch": 0.6059571562686694,
"grad_norm": 3.145759105682373,
"kl": 2.332421875,
"learning_rate": 1.2086796493761495e-06,
"loss": 0.2836,
"reward": 1.865178644657135,
"reward_std": 0.5404426328837871,
"rewards/accuracy_reward": 0.16428572041913866,
"rewards/format_reward": 0.8258929014205932,
"rewards/tag_count_reward": 0.8750000387430191,
"step": 1775
},
{
"clip_ratio": 0.0,
"completion_length": 696.7169952392578,
"epoch": 0.6076640778356235,
"grad_norm": 1.4645618200302124,
"kl": 1.76787109375,
"learning_rate": 1.1999165472676426e-06,
"loss": 0.1652,
"reward": 1.9060268819332122,
"reward_std": 0.5109671518206597,
"rewards/accuracy_reward": 0.18392857955768704,
"rewards/format_reward": 0.8392857491970063,
"rewards/tag_count_reward": 0.8828125417232513,
"step": 1780
},
{
"clip_ratio": 0.0,
"completion_length": 690.5178894042969,
"epoch": 0.6093709994025774,
"grad_norm": 1.7024650573730469,
"kl": 1.23330078125,
"learning_rate": 1.1911641010485598e-06,
"loss": 0.1556,
"reward": 2.016741174459457,
"reward_std": 0.43561617806553843,
"rewards/accuracy_reward": 0.22678572572767736,
"rewards/format_reward": 0.8758928954601288,
"rewards/tag_count_reward": 0.9140625476837159,
"step": 1785
},
{
"clip_ratio": 0.0,
"completion_length": 721.9571762084961,
"epoch": 0.6110779209695314,
"grad_norm": 1.3666507005691528,
"kl": 1.13935546875,
"learning_rate": 1.182422621516109e-06,
"loss": 0.1252,
"reward": 1.9524554371833802,
"reward_std": 0.398265440762043,
"rewards/accuracy_reward": 0.15625000530853866,
"rewards/format_reward": 0.8839286118745804,
"rewards/tag_count_reward": 0.9122768253087997,
"step": 1790
},
{
"clip_ratio": 0.0,
"completion_length": 684.8982421875,
"epoch": 0.6127848425364855,
"grad_norm": 2.1875545978546143,
"kl": 1.0215087890625,
"learning_rate": 1.1736924190780725e-06,
"loss": 0.0728,
"reward": 2.037276875972748,
"reward_std": 0.3939563654363155,
"rewards/accuracy_reward": 0.19464286416769028,
"rewards/format_reward": 0.9089286148548126,
"rewards/tag_count_reward": 0.933705398440361,
"step": 1795
},
{
"clip_ratio": 0.0,
"completion_length": 758.3491455078125,
"epoch": 0.6144917641034394,
"grad_norm": 3.1847763061523438,
"kl": 1.2671875,
"learning_rate": 1.1649738037417878e-06,
"loss": 0.1705,
"reward": 1.9837054431438446,
"reward_std": 0.40774648189544677,
"rewards/accuracy_reward": 0.16517857890576124,
"rewards/format_reward": 0.8955357521772385,
"rewards/tag_count_reward": 0.9229911148548127,
"step": 1800
},
{
"clip_ratio": 0.0,
"completion_length": 709.7312835693359,
"epoch": 0.6161986856703935,
"grad_norm": 1.9738051891326904,
"kl": 1.80869140625,
"learning_rate": 1.1562670851031345e-06,
"loss": 0.2007,
"reward": 1.983035796880722,
"reward_std": 0.4977798670530319,
"rewards/accuracy_reward": 0.21428572218865155,
"rewards/format_reward": 0.8633928954601288,
"rewards/tag_count_reward": 0.9053571909666062,
"step": 1805
},
{
"clip_ratio": 0.0,
"completion_length": 755.1071746826171,
"epoch": 0.6179056072373474,
"grad_norm": 7.7020673751831055,
"kl": 1.845703125,
"learning_rate": 1.1475725723355462e-06,
"loss": 0.1874,
"reward": 1.864955449104309,
"reward_std": 0.5472064293920994,
"rewards/accuracy_reward": 0.17678572256118058,
"rewards/format_reward": 0.8205357551574707,
"rewards/tag_count_reward": 0.8676339685916901,
"step": 1810
},
{
"clip_ratio": 0.0,
"completion_length": 727.8607482910156,
"epoch": 0.6196125288043014,
"grad_norm": 2.8931884765625,
"kl": 2.0552734375,
"learning_rate": 1.1388905741790269e-06,
"loss": 0.2344,
"reward": 1.922767949104309,
"reward_std": 0.4940613195300102,
"rewards/accuracy_reward": 0.15089286286383868,
"rewards/format_reward": 0.8651786088943482,
"rewards/tag_count_reward": 0.9066964745521545,
"step": 1815
},
{
"clip_ratio": 0.0,
"completion_length": 765.1125366210938,
"epoch": 0.6213194503712555,
"grad_norm": 1.5861130952835083,
"kl": 1.46142578125,
"learning_rate": 1.1302213989291914e-06,
"loss": 0.2015,
"reward": 1.9593750953674316,
"reward_std": 0.4679669015109539,
"rewards/accuracy_reward": 0.1785714362747967,
"rewards/format_reward": 0.8732143253087997,
"rewards/tag_count_reward": 0.9075893282890319,
"step": 1820
},
{
"clip_ratio": 0.0,
"completion_length": 763.0911102294922,
"epoch": 0.6230263719382094,
"grad_norm": 2.04757022857666,
"kl": 1.56689453125,
"learning_rate": 1.1215653544263147e-06,
"loss": 0.1845,
"reward": 2.006250095367432,
"reward_std": 0.46510125435888766,
"rewards/accuracy_reward": 0.21250000968575478,
"rewards/format_reward": 0.8776786118745804,
"rewards/tag_count_reward": 0.9160714685916901,
"step": 1825
},
{
"clip_ratio": 0.0,
"completion_length": 743.4795013427735,
"epoch": 0.6247332935051635,
"grad_norm": 3.3067221641540527,
"kl": 1.68193359375,
"learning_rate": 1.1129227480444041e-06,
"loss": 0.1842,
"reward": 1.9939732909202577,
"reward_std": 0.4414864867925644,
"rewards/accuracy_reward": 0.20446429373696445,
"rewards/format_reward": 0.8767857551574707,
"rewards/tag_count_reward": 0.9127232581377029,
"step": 1830
},
{
"clip_ratio": 0.0,
"completion_length": 765.1839599609375,
"epoch": 0.6264402150721174,
"grad_norm": 3.4389750957489014,
"kl": 1.82890625,
"learning_rate": 1.10429388668028e-06,
"loss": 0.2676,
"reward": 2.0104911625385284,
"reward_std": 0.5313868165016175,
"rewards/accuracy_reward": 0.22589287031441926,
"rewards/format_reward": 0.8705357581377029,
"rewards/tag_count_reward": 0.9140625417232513,
"step": 1835
},
{
"clip_ratio": 0.0,
"completion_length": 746.4893157958984,
"epoch": 0.6281471366390714,
"grad_norm": 2.3858678340911865,
"kl": 1.932421875,
"learning_rate": 1.0956790767426834e-06,
"loss": 0.2194,
"reward": 1.9825893580913543,
"reward_std": 0.5101016968488693,
"rewards/accuracy_reward": 0.2044642936438322,
"rewards/format_reward": 0.8723214715719223,
"rewards/tag_count_reward": 0.9058036118745804,
"step": 1840
},
{
"clip_ratio": 0.0,
"completion_length": 746.9839630126953,
"epoch": 0.6298540582060255,
"grad_norm": 3.954383134841919,
"kl": 2.05859375,
"learning_rate": 1.0870786241413909e-06,
"loss": 0.2344,
"reward": 1.9395090341567993,
"reward_std": 0.5304494857788086,
"rewards/accuracy_reward": 0.2142857256345451,
"rewards/format_reward": 0.8401786118745804,
"rewards/tag_count_reward": 0.8850446850061416,
"step": 1845
},
{
"clip_ratio": 0.0,
"completion_length": 755.4393157958984,
"epoch": 0.6315609797729794,
"grad_norm": 6.762625694274902,
"kl": 1.98203125,
"learning_rate": 1.078492834276354e-06,
"loss": 0.2525,
"reward": 1.8654018759727478,
"reward_std": 0.5434098705649376,
"rewards/accuracy_reward": 0.15803572097793223,
"rewards/format_reward": 0.8303571879863739,
"rewards/tag_count_reward": 0.8770089685916901,
"step": 1850
},
{
"clip_ratio": 0.0,
"completion_length": 788.8839660644531,
"epoch": 0.6332679013399334,
"grad_norm": 2.6459848880767822,
"kl": 2.58125,
"learning_rate": 1.069922012026854e-06,
"loss": 0.2477,
"reward": 1.8825893640518188,
"reward_std": 0.6637891173362732,
"rewards/accuracy_reward": 0.19910715203732252,
"rewards/format_reward": 0.8151786118745804,
"rewards/tag_count_reward": 0.8683036148548127,
"step": 1855
},
{
"clip_ratio": 0.0,
"completion_length": 723.5955688476563,
"epoch": 0.6349748229068874,
"grad_norm": 2.0201399326324463,
"kl": 1.7146484375,
"learning_rate": 1.0613664617406762e-06,
"loss": 0.1506,
"reward": 1.9308036625385285,
"reward_std": 0.5138662807643414,
"rewards/accuracy_reward": 0.17678572088479996,
"rewards/format_reward": 0.8562500417232514,
"rewards/tag_count_reward": 0.8977679014205933,
"step": 1860
},
{
"clip_ratio": 0.0,
"completion_length": 743.973257446289,
"epoch": 0.6366817444738414,
"grad_norm": 2.8136751651763916,
"kl": 1.8068359375,
"learning_rate": 1.0528264872233018e-06,
"loss": 0.1668,
"reward": 1.8919643580913543,
"reward_std": 0.5319583684206008,
"rewards/accuracy_reward": 0.18839286705479025,
"rewards/format_reward": 0.8267857581377029,
"rewards/tag_count_reward": 0.8767857551574707,
"step": 1865
},
{
"clip_ratio": 0.0,
"completion_length": 695.7518188476563,
"epoch": 0.6383886660407955,
"grad_norm": 2.7147600650787354,
"kl": 2.24091796875,
"learning_rate": 1.0443023917271202e-06,
"loss": 0.1887,
"reward": 1.868303656578064,
"reward_std": 0.5464206546545028,
"rewards/accuracy_reward": 0.20178572367876768,
"rewards/format_reward": 0.8080357521772384,
"rewards/tag_count_reward": 0.8584821790456771,
"step": 1870
},
{
"clip_ratio": 0.0,
"completion_length": 718.8482482910156,
"epoch": 0.6400955876077494,
"grad_norm": 3.581350326538086,
"kl": 1.817578125,
"learning_rate": 1.0357944779406609e-06,
"loss": 0.1201,
"reward": 1.9125000953674316,
"reward_std": 0.5139884263277054,
"rewards/accuracy_reward": 0.19821429224684833,
"rewards/format_reward": 0.8348214745521545,
"rewards/tag_count_reward": 0.8794643253087997,
"step": 1875
},
{
"clip_ratio": 0.0,
"completion_length": 686.5134216308594,
"epoch": 0.6418025091747034,
"grad_norm": 1.0345350503921509,
"kl": 1.70791015625,
"learning_rate": 1.0273030479778456e-06,
"loss": 0.1892,
"reward": 1.9660715281963348,
"reward_std": 0.45822909846901894,
"rewards/accuracy_reward": 0.1767857238650322,
"rewards/format_reward": 0.877678605914116,
"rewards/tag_count_reward": 0.9116071879863739,
"step": 1880
},
{
"clip_ratio": 0.0,
"completion_length": 720.682177734375,
"epoch": 0.6435094307416575,
"grad_norm": 0.9296639561653137,
"kl": 1.1458984375,
"learning_rate": 1.0188284033672586e-06,
"loss": 0.1595,
"reward": 1.9872768700122834,
"reward_std": 0.40355037674307825,
"rewards/accuracy_reward": 0.15714286332949995,
"rewards/format_reward": 0.9053571850061417,
"rewards/tag_count_reward": 0.924776828289032,
"step": 1885
},
{
"clip_ratio": 0.0,
"completion_length": 651.3928894042969,
"epoch": 0.6452163523086114,
"grad_norm": 5.478949069976807,
"kl": 0.98974609375,
"learning_rate": 1.0103708450414404e-06,
"loss": 0.079,
"reward": 2.0517858028411866,
"reward_std": 0.419903215020895,
"rewards/accuracy_reward": 0.2044642929919064,
"rewards/format_reward": 0.9125000387430191,
"rewards/tag_count_reward": 0.93482146859169,
"step": 1890
},
{
"clip_ratio": 0.0,
"completion_length": 710.156283569336,
"epoch": 0.6469232738755654,
"grad_norm": 2.714566469192505,
"kl": 1.3677734375,
"learning_rate": 1.0019306733262022e-06,
"loss": 0.1255,
"reward": 1.9647322475910187,
"reward_std": 0.40584927052259445,
"rewards/accuracy_reward": 0.16160715082660318,
"rewards/format_reward": 0.8901786118745804,
"rewards/tag_count_reward": 0.9129464715719223,
"step": 1895
},
{
"clip_ratio": 0.0,
"completion_length": 703.6277053833007,
"epoch": 0.6486301954425194,
"grad_norm": 1.2529172897338867,
"kl": 1.42275390625,
"learning_rate": 9.935081879299605e-07,
"loss": 0.1632,
"reward": 1.9410715401172638,
"reward_std": 0.4627831496298313,
"rewards/accuracy_reward": 0.1928571521304548,
"rewards/format_reward": 0.8544643193483352,
"rewards/tag_count_reward": 0.8937500357627869,
"step": 1900
},
{
"clip_ratio": 0.0,
"completion_length": 745.7062866210938,
"epoch": 0.6503371170094734,
"grad_norm": 2.3366475105285645,
"kl": 1.8337890625,
"learning_rate": 9.851036879330958e-07,
"loss": 0.1861,
"reward": 1.936830472946167,
"reward_std": 0.5271413020789624,
"rewards/accuracy_reward": 0.18035714915022255,
"rewards/format_reward": 0.8553571850061417,
"rewards/tag_count_reward": 0.9011161088943481,
"step": 1905
},
{
"clip_ratio": 0.0,
"completion_length": 735.8169952392578,
"epoch": 0.6520440385764275,
"grad_norm": 2.5596821308135986,
"kl": 1.5796875,
"learning_rate": 9.767174717773307e-07,
"loss": 0.1504,
"reward": 1.9691965103149414,
"reward_std": 0.4714687965810299,
"rewards/accuracy_reward": 0.1839285807684064,
"rewards/format_reward": 0.8750000447034836,
"rewards/tag_count_reward": 0.9102679014205932,
"step": 1910
},
{
"clip_ratio": 0.0,
"completion_length": 727.2643218994141,
"epoch": 0.6537509601433814,
"grad_norm": 0.8387001156806946,
"kl": 1.6423828125,
"learning_rate": 9.683498372551335e-07,
"loss": 0.1925,
"reward": 1.9691965103149414,
"reward_std": 0.4812636740505695,
"rewards/accuracy_reward": 0.19553572311997414,
"rewards/format_reward": 0.8669643342494965,
"rewards/tag_count_reward": 0.9066964685916901,
"step": 1915
},
{
"clip_ratio": 0.0,
"completion_length": 734.2857482910156,
"epoch": 0.6554578817103354,
"grad_norm": 0.9733322262763977,
"kl": 1.50322265625,
"learning_rate": 9.600010814991425e-07,
"loss": 0.1831,
"reward": 1.9265625894069671,
"reward_std": 0.4761272206902504,
"rewards/accuracy_reward": 0.16339286472648382,
"rewards/format_reward": 0.8625000387430191,
"rewards/tag_count_reward": 0.9006696850061416,
"step": 1920
},
{
"clip_ratio": 0.0,
"completion_length": 743.7411041259766,
"epoch": 0.6571648032772894,
"grad_norm": 1.7600935697555542,
"kl": 1.1025390625,
"learning_rate": 9.51671500971617e-07,
"loss": 0.1222,
"reward": 1.9857143700122832,
"reward_std": 0.41541323736310004,
"rewards/accuracy_reward": 0.17946429513394832,
"rewards/format_reward": 0.8883929014205932,
"rewards/tag_count_reward": 0.9178571850061417,
"step": 1925
},
{
"clip_ratio": 0.0,
"completion_length": 734.8607513427735,
"epoch": 0.6588717248442434,
"grad_norm": 1.9581345319747925,
"kl": 1.437109375,
"learning_rate": 9.433613914539076e-07,
"loss": 0.1572,
"reward": 1.9459822356700898,
"reward_std": 0.4454167552292347,
"rewards/accuracy_reward": 0.1678571510128677,
"rewards/format_reward": 0.873214328289032,
"rewards/tag_count_reward": 0.9049107521772385,
"step": 1930
},
{
"clip_ratio": 0.0,
"completion_length": 690.2170013427734,
"epoch": 0.6605786464111975,
"grad_norm": 4.418382167816162,
"kl": 1.1951171875,
"learning_rate": 9.350710480359549e-07,
"loss": 0.1354,
"reward": 2.0120536625385284,
"reward_std": 0.39660380184650423,
"rewards/accuracy_reward": 0.18303572237491608,
"rewards/format_reward": 0.9000000387430191,
"rewards/tag_count_reward": 0.92901791036129,
"step": 1935
},
{
"clip_ratio": 0.0,
"completion_length": 697.366098022461,
"epoch": 0.6622855679781514,
"grad_norm": 1.5262176990509033,
"kl": 1.357177734375,
"learning_rate": 9.268007651058089e-07,
"loss": 0.129,
"reward": 2.02790185213089,
"reward_std": 0.4235570203512907,
"rewards/accuracy_reward": 0.22767858300358057,
"rewards/format_reward": 0.8857143253087998,
"rewards/tag_count_reward": 0.9145089715719223,
"step": 1940
},
{
"clip_ratio": 0.0,
"completion_length": 733.0839538574219,
"epoch": 0.6639924895451054,
"grad_norm": 1.9810816049575806,
"kl": 1.683984375,
"learning_rate": 9.185508363391787e-07,
"loss": 0.1943,
"reward": 1.9560268938541412,
"reward_std": 0.48697994872927663,
"rewards/accuracy_reward": 0.1714285804890096,
"rewards/format_reward": 0.8723214715719223,
"rewards/tag_count_reward": 0.912276828289032,
"step": 1945
},
{
"clip_ratio": 0.0,
"completion_length": 760.0661041259766,
"epoch": 0.6656994111120594,
"grad_norm": 2.6021721363067627,
"kl": 1.9115234375,
"learning_rate": 9.103215546890001e-07,
"loss": 0.1917,
"reward": 1.939285784959793,
"reward_std": 0.5354632467031479,
"rewards/accuracy_reward": 0.18482143925502897,
"rewards/format_reward": 0.8562500417232514,
"rewards/tag_count_reward": 0.8982143223285675,
"step": 1950
},
{
"clip_ratio": 0.0,
"completion_length": 708.5402069091797,
"epoch": 0.6674063326790134,
"grad_norm": 5.726203918457031,
"kl": 1.6966796875,
"learning_rate": 9.021132123750361e-07,
"loss": 0.1782,
"reward": 1.943303656578064,
"reward_std": 0.4957593522965908,
"rewards/accuracy_reward": 0.1562500072643161,
"rewards/format_reward": 0.8767857551574707,
"rewards/tag_count_reward": 0.9102679014205932,
"step": 1955
},
{
"clip_ratio": 0.0,
"completion_length": 738.5419921875,
"epoch": 0.6691132542459673,
"grad_norm": 3.667335271835327,
"kl": 1.2876220703125,
"learning_rate": 8.93926100873498e-07,
"loss": 0.1574,
"reward": 2.027678656578064,
"reward_std": 0.41424002312123775,
"rewards/accuracy_reward": 0.18660715240985154,
"rewards/format_reward": 0.9053571850061417,
"rewards/tag_count_reward": 0.9357143193483353,
"step": 1960
},
{
"clip_ratio": 0.0,
"completion_length": 715.632177734375,
"epoch": 0.6708201758129214,
"grad_norm": 5.630631923675537,
"kl": 1.3962890625,
"learning_rate": 8.857605109066977e-07,
"loss": 0.1684,
"reward": 2.0866072475910187,
"reward_std": 0.4239813320338726,
"rewards/accuracy_reward": 0.23660715185105802,
"rewards/format_reward": 0.9151786088943481,
"rewards/tag_count_reward": 0.9348214745521546,
"step": 1965
},
{
"clip_ratio": 0.0,
"completion_length": 724.7134246826172,
"epoch": 0.6725270973798754,
"grad_norm": 0.8905249834060669,
"kl": 1.224609375,
"learning_rate": 8.776167324327203e-07,
"loss": 0.1153,
"reward": 2.027678686380386,
"reward_std": 0.3887888576835394,
"rewards/accuracy_reward": 0.16875000894069672,
"rewards/format_reward": 0.9169643223285675,
"rewards/tag_count_reward": 0.9419643253087997,
"step": 1970
},
{
"clip_ratio": 0.0,
"completion_length": 782.5875366210937,
"epoch": 0.6742340189468294,
"grad_norm": 2.29498291015625,
"kl": 1.78818359375,
"learning_rate": 8.694950546351335e-07,
"loss": 0.2179,
"reward": 1.9397322356700897,
"reward_std": 0.43477702885866165,
"rewards/accuracy_reward": 0.14375000745058059,
"rewards/format_reward": 0.8812500387430191,
"rewards/tag_count_reward": 0.9147321850061416,
"step": 1975
},
{
"clip_ratio": 0.0,
"completion_length": 711.3321807861328,
"epoch": 0.6759409405137834,
"grad_norm": 1.6042215824127197,
"kl": 1.51044921875,
"learning_rate": 8.61395765912712e-07,
"loss": 0.1558,
"reward": 2.017857217788696,
"reward_std": 0.41501733139157293,
"rewards/accuracy_reward": 0.18750000819563867,
"rewards/format_reward": 0.9035714656114578,
"rewards/tag_count_reward": 0.9267857581377029,
"step": 1980
},
{
"clip_ratio": 0.0,
"completion_length": 749.9616394042969,
"epoch": 0.6776478620807374,
"grad_norm": 6.817398548126221,
"kl": 1.70859375,
"learning_rate": 8.533191538692026e-07,
"loss": 0.199,
"reward": 2.0314733028411864,
"reward_std": 0.4689814649522305,
"rewards/accuracy_reward": 0.2258928676135838,
"rewards/format_reward": 0.8875000447034835,
"rewards/tag_count_reward": 0.9180804014205932,
"step": 1985
},
{
"clip_ratio": 0.0,
"completion_length": 775.2875335693359,
"epoch": 0.6793547836476914,
"grad_norm": 5.016016006469727,
"kl": 1.9013671875,
"learning_rate": 8.452655053031066e-07,
"loss": 0.2043,
"reward": 1.953571516275406,
"reward_std": 0.5462075427174569,
"rewards/accuracy_reward": 0.19375000931322575,
"rewards/format_reward": 0.8625000476837158,
"rewards/tag_count_reward": 0.89732146859169,
"step": 1990
},
{
"clip_ratio": 0.0,
"completion_length": 765.9411102294922,
"epoch": 0.6810617052146454,
"grad_norm": 1.6670773029327393,
"kl": 2.117578125,
"learning_rate": 8.372351061975014e-07,
"loss": 0.2836,
"reward": 1.9348215281963348,
"reward_std": 0.5541624218225479,
"rewards/accuracy_reward": 0.2035714391618967,
"rewards/format_reward": 0.8491071790456772,
"rewards/tag_count_reward": 0.8821429014205933,
"step": 1995
},
{
"clip_ratio": 0.0,
"completion_length": 807.7107513427734,
"epoch": 0.6827686267815993,
"grad_norm": 1.560651183128357,
"kl": 1.6337890625,
"learning_rate": 8.292282417098763e-07,
"loss": 0.2252,
"reward": 1.9933036804199218,
"reward_std": 0.47950020134449006,
"rewards/accuracy_reward": 0.20089286658912897,
"rewards/format_reward": 0.8812500447034836,
"rewards/tag_count_reward": 0.9111607640981674,
"step": 2000
},
{
"clip_ratio": 0.0,
"completion_length": 756.7250396728516,
"epoch": 0.6844755483485534,
"grad_norm": 4.225320816040039,
"kl": 1.6544921875,
"learning_rate": 8.212451961620176e-07,
"loss": 0.2091,
"reward": 1.9883929371833802,
"reward_std": 0.4503480665385723,
"rewards/accuracy_reward": 0.19910714793950318,
"rewards/format_reward": 0.8785714775323867,
"rewards/tag_count_reward": 0.9107143253087997,
"step": 2005
},
{
"clip_ratio": 0.0,
"completion_length": 753.9098541259766,
"epoch": 0.6861824699155074,
"grad_norm": 1.1238821744918823,
"kl": 1.6462890625,
"learning_rate": 8.132862530299031e-07,
"loss": 0.2113,
"reward": 1.9857143700122832,
"reward_std": 0.5074703797698021,
"rewards/accuracy_reward": 0.21875000707805156,
"rewards/format_reward": 0.8669643253087997,
"rewards/tag_count_reward": 0.9000000417232513,
"step": 2010
},
{
"clip_ratio": 0.0,
"completion_length": 776.651821899414,
"epoch": 0.6878893914824614,
"grad_norm": 5.102161407470703,
"kl": 2.066015625,
"learning_rate": 8.053516949336425e-07,
"loss": 0.2291,
"reward": 1.9631697356700897,
"reward_std": 0.4666756376624107,
"rewards/accuracy_reward": 0.18392858169972898,
"rewards/format_reward": 0.875892898440361,
"rewards/tag_count_reward": 0.9033482551574707,
"step": 2015
},
{
"clip_ratio": 0.0,
"completion_length": 731.7232482910156,
"epoch": 0.6895963130494154,
"grad_norm": 4.221590042114258,
"kl": 1.782421875,
"learning_rate": 7.974418036274371e-07,
"loss": 0.1794,
"reward": 1.9631697356700897,
"reward_std": 0.5094508707523346,
"rewards/accuracy_reward": 0.19821429466828705,
"rewards/format_reward": 0.8633928954601288,
"rewards/tag_count_reward": 0.9015625447034836,
"step": 2020
},
{
"clip_ratio": 0.0,
"completion_length": 745.8018249511719,
"epoch": 0.6913032346163693,
"grad_norm": 2.238372802734375,
"kl": 1.6064453125,
"learning_rate": 7.895568599895763e-07,
"loss": 0.1517,
"reward": 2.003348296880722,
"reward_std": 0.4860348865389824,
"rewards/accuracy_reward": 0.20178572349250318,
"rewards/format_reward": 0.882142898440361,
"rewards/tag_count_reward": 0.9194196820259094,
"step": 2025
},
{
"clip_ratio": 0.0,
"completion_length": 776.3482482910156,
"epoch": 0.6930101561833234,
"grad_norm": 2.6449360847473145,
"kl": 1.395166015625,
"learning_rate": 7.816971440124661e-07,
"loss": 0.1546,
"reward": 2.007142949104309,
"reward_std": 0.4705513596534729,
"rewards/accuracy_reward": 0.2071428656578064,
"rewards/format_reward": 0.8848214685916901,
"rewards/tag_count_reward": 0.9151786148548127,
"step": 2030
},
{
"clip_ratio": 0.0,
"completion_length": 720.1062850952148,
"epoch": 0.6947170777502774,
"grad_norm": 0.6730054020881653,
"kl": 1.17783203125,
"learning_rate": 7.738629347926818e-07,
"loss": 0.1375,
"reward": 2.0524554371833803,
"reward_std": 0.4183092713356018,
"rewards/accuracy_reward": 0.20446429429575802,
"rewards/format_reward": 0.9133928954601288,
"rewards/tag_count_reward": 0.9345982581377029,
"step": 2035
},
{
"clip_ratio": 0.0,
"completion_length": 796.7393218994141,
"epoch": 0.6964239993172314,
"grad_norm": 0.5611494779586792,
"kl": 1.42998046875,
"learning_rate": 7.660545105210627e-07,
"loss": 0.1679,
"reward": 2.0196429550647736,
"reward_std": 0.4546675443649292,
"rewards/accuracy_reward": 0.20982143972069026,
"rewards/format_reward": 0.8875000417232514,
"rewards/tag_count_reward": 0.9223214745521545,
"step": 2040
},
{
"clip_ratio": 0.0,
"completion_length": 772.7375335693359,
"epoch": 0.6981309208841854,
"grad_norm": 1.859115719795227,
"kl": 1.3640625,
"learning_rate": 7.582721484728289e-07,
"loss": 0.22,
"reward": 1.988392949104309,
"reward_std": 0.39518385380506516,
"rewards/accuracy_reward": 0.1821428638882935,
"rewards/format_reward": 0.8857143223285675,
"rewards/tag_count_reward": 0.9205357611179352,
"step": 2045
},
{
"clip_ratio": 0.0,
"completion_length": 725.6500366210937,
"epoch": 0.6998378424511393,
"grad_norm": 6.75427770614624,
"kl": 1.10439453125,
"learning_rate": 7.50516124997738e-07,
"loss": 0.1238,
"reward": 1.9763393878936768,
"reward_std": 0.39795525595545767,
"rewards/accuracy_reward": 0.16071429178118707,
"rewards/format_reward": 0.8919643223285675,
"rewards/tag_count_reward": 0.9236607491970062,
"step": 2050
},
{
"clip_ratio": 0.0,
"completion_length": 745.8535980224609,
"epoch": 0.7015447640180934,
"grad_norm": 1.4931570291519165,
"kl": 1.0126953125,
"learning_rate": 7.427867155102712e-07,
"loss": 0.133,
"reward": 2.013169747591019,
"reward_std": 0.380087611079216,
"rewards/accuracy_reward": 0.1669642947614193,
"rewards/format_reward": 0.9080357491970062,
"rewards/tag_count_reward": 0.9381696850061416,
"step": 2055
},
{
"clip_ratio": 0.0,
"completion_length": 733.42861328125,
"epoch": 0.7032516855850474,
"grad_norm": 5.1911211013793945,
"kl": 1.214453125,
"learning_rate": 7.350841944798547e-07,
"loss": 0.16,
"reward": 1.9904018700122834,
"reward_std": 0.4548638232052326,
"rewards/accuracy_reward": 0.2062500131316483,
"rewards/format_reward": 0.8714286029338837,
"rewards/tag_count_reward": 0.9127232551574707,
"step": 2060
},
{
"clip_ratio": 0.0,
"completion_length": 751.2232513427734,
"epoch": 0.7049586071520013,
"grad_norm": 1.5467967987060547,
"kl": 1.441796875,
"learning_rate": 7.2740883542111e-07,
"loss": 0.1354,
"reward": 1.958035796880722,
"reward_std": 0.5087195709347725,
"rewards/accuracy_reward": 0.21071429485455156,
"rewards/format_reward": 0.851785758137703,
"rewards/tag_count_reward": 0.8955357581377029,
"step": 2065
},
{
"clip_ratio": 0.0,
"completion_length": 778.2982543945312,
"epoch": 0.7066655287189554,
"grad_norm": 4.287073612213135,
"kl": 1.624609375,
"learning_rate": 7.197609108841465e-07,
"loss": 0.137,
"reward": 1.9265625834465028,
"reward_std": 0.5439430341124535,
"rewards/accuracy_reward": 0.18303572461009027,
"rewards/format_reward": 0.8419643342494965,
"rewards/tag_count_reward": 0.9015625417232513,
"step": 2070
},
{
"clip_ratio": 0.0,
"completion_length": 745.2491424560546,
"epoch": 0.7083724502859093,
"grad_norm": 1.5213736295700073,
"kl": 1.6482421875,
"learning_rate": 7.121406924448783e-07,
"loss": 0.1766,
"reward": 1.944196528196335,
"reward_std": 0.5264617592096329,
"rewards/accuracy_reward": 0.18392858002334833,
"rewards/format_reward": 0.8553571790456772,
"rewards/tag_count_reward": 0.9049107551574707,
"step": 2075
},
{
"clip_ratio": 0.0,
"completion_length": 733.4545013427735,
"epoch": 0.7100793718528634,
"grad_norm": 1.2759286165237427,
"kl": 1.5237548828125,
"learning_rate": 7.045484506953832e-07,
"loss": 0.1214,
"reward": 1.9647322237491607,
"reward_std": 0.4902063623070717,
"rewards/accuracy_reward": 0.20535715045407416,
"rewards/format_reward": 0.8589286088943482,
"rewards/tag_count_reward": 0.9004464775323868,
"step": 2080
},
{
"clip_ratio": 0.0,
"completion_length": 730.1562896728516,
"epoch": 0.7117862934198174,
"grad_norm": 4.01860237121582,
"kl": 1.3330078125,
"learning_rate": 6.969844552342939e-07,
"loss": 0.1437,
"reward": 1.993303656578064,
"reward_std": 0.42157181948423383,
"rewards/accuracy_reward": 0.1848214372061193,
"rewards/format_reward": 0.8848214656114578,
"rewards/tag_count_reward": 0.923660758137703,
"step": 2085
},
{
"clip_ratio": 0.0,
"completion_length": 728.0446838378906,
"epoch": 0.7134932149867713,
"grad_norm": 1.2710025310516357,
"kl": 1.3125,
"learning_rate": 6.894489746572252e-07,
"loss": 0.1311,
"reward": 1.9805804371833802,
"reward_std": 0.44029773622751234,
"rewards/accuracy_reward": 0.16696429029107093,
"rewards/format_reward": 0.8910714685916901,
"rewards/tag_count_reward": 0.9225446879863739,
"step": 2090
},
{
"clip_ratio": 0.0,
"completion_length": 714.4286010742187,
"epoch": 0.7152001365537254,
"grad_norm": 2.4798624515533447,
"kl": 1.593359375,
"learning_rate": 6.819422765472337e-07,
"loss": 0.2047,
"reward": 2.0042411744594575,
"reward_std": 0.46089204400777817,
"rewards/accuracy_reward": 0.2044642947614193,
"rewards/format_reward": 0.8839286178350448,
"rewards/tag_count_reward": 0.9158482521772384,
"step": 2095
},
{
"clip_ratio": 0.0,
"completion_length": 691.5536010742187,
"epoch": 0.7169070581206793,
"grad_norm": 2.2551651000976562,
"kl": 1.31337890625,
"learning_rate": 6.744646274653198e-07,
"loss": 0.1504,
"reward": 2.024553656578064,
"reward_std": 0.4200122021138668,
"rewards/accuracy_reward": 0.23571429625153542,
"rewards/format_reward": 0.8785714685916901,
"rewards/tag_count_reward": 0.9102678954601288,
"step": 2100
},
{
"clip_ratio": 0.0,
"completion_length": 737.5652160644531,
"epoch": 0.7186139796876334,
"grad_norm": 1.9618374109268188,
"kl": 1.533203125,
"learning_rate": 6.670162929409572e-07,
"loss": 0.1547,
"reward": 1.949330461025238,
"reward_std": 0.47244517505168915,
"rewards/accuracy_reward": 0.17321429355069995,
"rewards/format_reward": 0.8625000447034836,
"rewards/tag_count_reward": 0.9136161088943482,
"step": 2105
},
{
"clip_ratio": 0.0,
"completion_length": 735.7500244140625,
"epoch": 0.7203209012545874,
"grad_norm": 3.131383180618286,
"kl": 1.41767578125,
"learning_rate": 6.595975374626699e-07,
"loss": 0.182,
"reward": 1.9044643700122834,
"reward_std": 0.48238158598542213,
"rewards/accuracy_reward": 0.13839286249130964,
"rewards/format_reward": 0.8616071850061416,
"rewards/tag_count_reward": 0.9044643253087997,
"step": 2110
},
{
"clip_ratio": 0.0,
"completion_length": 700.0759216308594,
"epoch": 0.7220278228215413,
"grad_norm": 1.9941433668136597,
"kl": 1.3751953125,
"learning_rate": 6.522086244686351e-07,
"loss": 0.1702,
"reward": 2.022098296880722,
"reward_std": 0.42175976261496545,
"rewards/accuracy_reward": 0.2017857232131064,
"rewards/format_reward": 0.8910714715719223,
"rewards/tag_count_reward": 0.9292411118745804,
"step": 2115
},
{
"clip_ratio": 0.0,
"completion_length": 732.7411071777344,
"epoch": 0.7237347443884954,
"grad_norm": 1.4535045623779297,
"kl": 1.6833984375,
"learning_rate": 6.448498163373324e-07,
"loss": 0.2005,
"reward": 1.98214293718338,
"reward_std": 0.4729441896080971,
"rewards/accuracy_reward": 0.19821429708972574,
"rewards/format_reward": 0.8723214745521546,
"rewards/tag_count_reward": 0.9116071820259094,
"step": 2120
},
{
"clip_ratio": 0.0,
"completion_length": 744.5536071777344,
"epoch": 0.7254416659554493,
"grad_norm": 5.207380294799805,
"kl": 1.4818359375,
"learning_rate": 6.375213743782236e-07,
"loss": 0.2118,
"reward": 1.9560268759727477,
"reward_std": 0.4524582926183939,
"rewards/accuracy_reward": 0.17678572228178382,
"rewards/format_reward": 0.873214328289032,
"rewards/tag_count_reward": 0.9060268312692642,
"step": 2125
},
{
"clip_ratio": 0.0,
"completion_length": 720.8357482910156,
"epoch": 0.7271485875224033,
"grad_norm": 3.1080732345581055,
"kl": 1.273681640625,
"learning_rate": 6.302235588224764e-07,
"loss": 0.1525,
"reward": 1.907366156578064,
"reward_std": 0.4530935399234295,
"rewards/accuracy_reward": 0.18035715045407413,
"rewards/format_reward": 0.8446428924798965,
"rewards/tag_count_reward": 0.8823661178350448,
"step": 2130
},
{
"clip_ratio": 0.0,
"completion_length": 697.6750335693359,
"epoch": 0.7288555090893574,
"grad_norm": 1.5537960529327393,
"kl": 1.09833984375,
"learning_rate": 6.229566288137212e-07,
"loss": 0.1274,
"reward": 2.033705461025238,
"reward_std": 0.41119332425296307,
"rewards/accuracy_reward": 0.192857151851058,
"rewards/format_reward": 0.9071429014205933,
"rewards/tag_count_reward": 0.933705398440361,
"step": 2135
},
{
"clip_ratio": 0.0,
"completion_length": 718.7955627441406,
"epoch": 0.7305624306563113,
"grad_norm": 1.1402044296264648,
"kl": 1.59580078125,
"learning_rate": 6.157208423988513e-07,
"loss": 0.1568,
"reward": 1.930357241630554,
"reward_std": 0.4769420772790909,
"rewards/accuracy_reward": 0.1625000076368451,
"rewards/format_reward": 0.8651786118745803,
"rewards/tag_count_reward": 0.9026786148548126,
"step": 2140
},
{
"clip_ratio": 0.0,
"completion_length": 703.5714599609375,
"epoch": 0.7322693522232654,
"grad_norm": 5.189487934112549,
"kl": 2.553125,
"learning_rate": 6.085164565188594e-07,
"loss": 0.2333,
"reward": 1.837723284959793,
"reward_std": 0.5507455065846443,
"rewards/accuracy_reward": 0.15982143608853222,
"rewards/format_reward": 0.8116071820259094,
"rewards/tag_count_reward": 0.866294676065445,
"step": 2145
},
{
"clip_ratio": 0.0,
"completion_length": 735.147348022461,
"epoch": 0.7339762737902193,
"grad_norm": 5.181674480438232,
"kl": 2.1427734375,
"learning_rate": 6.013437269997111e-07,
"loss": 0.2152,
"reward": 1.794866132736206,
"reward_std": 0.6164493501186371,
"rewards/accuracy_reward": 0.15178572125732898,
"rewards/format_reward": 0.7875000447034836,
"rewards/tag_count_reward": 0.855580398440361,
"step": 2150
},
{
"clip_ratio": 0.0,
"completion_length": 721.4187896728515,
"epoch": 0.7356831953571733,
"grad_norm": 2.7006282806396484,
"kl": 1.9798828125,
"learning_rate": 5.942029085432636e-07,
"loss": 0.2448,
"reward": 1.9575893998146057,
"reward_std": 0.5865769028663635,
"rewards/accuracy_reward": 0.2482142984867096,
"rewards/format_reward": 0.8321428954601288,
"rewards/tag_count_reward": 0.8772321879863739,
"step": 2155
},
{
"clip_ratio": 0.0,
"completion_length": 761.2812805175781,
"epoch": 0.7373901169241274,
"grad_norm": 3.3802006244659424,
"kl": 2.14765625,
"learning_rate": 5.87094254718219e-07,
"loss": 0.1986,
"reward": 1.8209822237491609,
"reward_std": 0.5044765949249268,
"rewards/accuracy_reward": 0.12767857704311608,
"rewards/format_reward": 0.8241071820259094,
"rewards/tag_count_reward": 0.8691964715719223,
"step": 2160
},
{
"clip_ratio": 0.0,
"completion_length": 717.7089599609375,
"epoch": 0.7390970384910813,
"grad_norm": 3.371708631515503,
"kl": 1.8705078125,
"learning_rate": 5.80018017951123e-07,
"loss": 0.2285,
"reward": 1.9609375894069672,
"reward_std": 0.527063025534153,
"rewards/accuracy_reward": 0.20446429466828703,
"rewards/format_reward": 0.8553571850061417,
"rewards/tag_count_reward": 0.9011161148548126,
"step": 2165
},
{
"clip_ratio": 0.0,
"completion_length": 724.678598022461,
"epoch": 0.7408039600580353,
"grad_norm": 1.3309767246246338,
"kl": 1.638671875,
"learning_rate": 5.729744495173978e-07,
"loss": 0.1628,
"reward": 1.9218750953674317,
"reward_std": 0.4835737131536007,
"rewards/accuracy_reward": 0.1580357243306935,
"rewards/format_reward": 0.8607143253087998,
"rewards/tag_count_reward": 0.9031250417232514,
"step": 2170
},
{
"clip_ratio": 0.0,
"completion_length": 709.6544952392578,
"epoch": 0.7425108816249893,
"grad_norm": 2.5176899433135986,
"kl": 1.289453125,
"learning_rate": 5.659637995324229e-07,
"loss": 0.1573,
"reward": 1.9895090281963348,
"reward_std": 0.4916133493185043,
"rewards/accuracy_reward": 0.2044642967171967,
"rewards/format_reward": 0.875892898440361,
"rewards/tag_count_reward": 0.9091518193483352,
"step": 2175
},
{
"clip_ratio": 0.0,
"completion_length": 747.656283569336,
"epoch": 0.7442178031919433,
"grad_norm": 0.922635018825531,
"kl": 1.71025390625,
"learning_rate": 5.589863169426506e-07,
"loss": 0.1529,
"reward": 1.9279018759727478,
"reward_std": 0.5085976898670197,
"rewards/accuracy_reward": 0.1892857227474451,
"rewards/format_reward": 0.8473214715719223,
"rewards/tag_count_reward": 0.8912946850061416,
"step": 2180
},
{
"clip_ratio": 0.0,
"completion_length": 734.5178894042969,
"epoch": 0.7459247247588974,
"grad_norm": 1.6850134134292603,
"kl": 1.76162109375,
"learning_rate": 5.520422495167671e-07,
"loss": 0.24,
"reward": 1.9770090103149414,
"reward_std": 0.47378580197691916,
"rewards/accuracy_reward": 0.2151785809546709,
"rewards/format_reward": 0.8607143282890319,
"rewards/tag_count_reward": 0.9011161088943481,
"step": 2185
},
{
"clip_ratio": 0.0,
"completion_length": 728.4803924560547,
"epoch": 0.7476316463258513,
"grad_norm": 3.3271028995513916,
"kl": 1.80859375,
"learning_rate": 5.451318438368943e-07,
"loss": 0.2188,
"reward": 1.9725447177886963,
"reward_std": 0.49253502711653707,
"rewards/accuracy_reward": 0.20267858086153864,
"rewards/format_reward": 0.8651786088943482,
"rewards/tag_count_reward": 0.9046875417232514,
"step": 2190
},
{
"clip_ratio": 0.0,
"completion_length": 706.0098602294922,
"epoch": 0.7493385678928053,
"grad_norm": 2.4637181758880615,
"kl": 1.594140625,
"learning_rate": 5.382553452898354e-07,
"loss": 0.1562,
"reward": 1.9122768878936767,
"reward_std": 0.4774385288357735,
"rewards/accuracy_reward": 0.15089286360889673,
"rewards/format_reward": 0.8616071850061416,
"rewards/tag_count_reward": 0.8997768282890319,
"step": 2195
},
{
"clip_ratio": 0.0,
"completion_length": 713.5857437133789,
"epoch": 0.7510454894597594,
"grad_norm": 3.416898012161255,
"kl": 2.0388671875,
"learning_rate": 5.314129980583572e-07,
"loss": 0.2084,
"reward": 1.9265625834465028,
"reward_std": 0.5427880018949509,
"rewards/accuracy_reward": 0.21250001024454832,
"rewards/format_reward": 0.832142898440361,
"rewards/tag_count_reward": 0.8819196820259094,
"step": 2200
},
{
"clip_ratio": 0.0,
"completion_length": 720.0196838378906,
"epoch": 0.7527524110267133,
"grad_norm": 2.489032506942749,
"kl": 2.124609375,
"learning_rate": 5.246050451125244e-07,
"loss": 0.1666,
"reward": 1.8587054431438446,
"reward_std": 0.6338795974850655,
"rewards/accuracy_reward": 0.18035715045407413,
"rewards/format_reward": 0.8107143253087997,
"rewards/tag_count_reward": 0.8676339685916901,
"step": 2205
},
{
"clip_ratio": 0.0,
"completion_length": 750.5732421875,
"epoch": 0.7544593325936674,
"grad_norm": 1.8052889108657837,
"kl": 2.0294921875,
"learning_rate": 5.178317282010667e-07,
"loss": 0.2124,
"reward": 1.9091518640518188,
"reward_std": 0.5391280226409435,
"rewards/accuracy_reward": 0.1982142969965935,
"rewards/format_reward": 0.8312500417232513,
"rewards/tag_count_reward": 0.8796875447034835,
"step": 2210
},
{
"clip_ratio": 0.0,
"completion_length": 720.4928985595703,
"epoch": 0.7561662541606213,
"grad_norm": 2.0707011222839355,
"kl": 1.9193359375,
"learning_rate": 5.110932878427982e-07,
"loss": 0.1962,
"reward": 1.904017949104309,
"reward_std": 0.4930908754467964,
"rewards/accuracy_reward": 0.19107143729925155,
"rewards/format_reward": 0.8366071820259094,
"rewards/tag_count_reward": 0.8763393223285675,
"step": 2215
},
{
"clip_ratio": 0.0,
"completion_length": 684.6955688476562,
"epoch": 0.7578731757275753,
"grad_norm": 2.0002994537353516,
"kl": 1.47373046875,
"learning_rate": 5.043899633180737e-07,
"loss": 0.1337,
"reward": 2.0084822356700895,
"reward_std": 0.466427081823349,
"rewards/accuracy_reward": 0.20178572162985803,
"rewards/format_reward": 0.8866071820259094,
"rewards/tag_count_reward": 0.920089328289032,
"step": 2220
},
{
"clip_ratio": 0.0,
"completion_length": 749.0723571777344,
"epoch": 0.7595800972945294,
"grad_norm": 1.084472894668579,
"kl": 1.1036865234375,
"learning_rate": 4.977219926602959e-07,
"loss": 0.1531,
"reward": 2.0250000953674316,
"reward_std": 0.39613064378499985,
"rewards/accuracy_reward": 0.19732143655419349,
"rewards/format_reward": 0.8982143253087997,
"rewards/tag_count_reward": 0.9294643312692642,
"step": 2225
},
{
"clip_ratio": 0.0,
"completion_length": 731.555386352539,
"epoch": 0.7612870188614833,
"grad_norm": 2.8694820404052734,
"kl": 1.3392578125,
"learning_rate": 4.910896126474581e-07,
"loss": 0.1268,
"reward": 2.042410796880722,
"reward_std": 0.3969473861157894,
"rewards/accuracy_reward": 0.186607151851058,
"rewards/format_reward": 0.9142857521772385,
"rewards/tag_count_reward": 0.9415179044008255,
"step": 2230
},
{
"clip_ratio": 0.0,
"completion_length": 703.9161041259765,
"epoch": 0.7629939404284373,
"grad_norm": 4.769933223724365,
"kl": 1.3875,
"learning_rate": 4.844930587937399e-07,
"loss": 0.1978,
"reward": 2.0287947416305543,
"reward_std": 0.41266813427209853,
"rewards/accuracy_reward": 0.1946428656578064,
"rewards/format_reward": 0.9017857551574707,
"rewards/tag_count_reward": 0.9323661208152771,
"step": 2235
},
{
"clip_ratio": 0.0,
"completion_length": 730.9107452392578,
"epoch": 0.7647008619953913,
"grad_norm": 2.770189046859741,
"kl": 1.587255859375,
"learning_rate": 4.779325653411413e-07,
"loss": 0.1698,
"reward": 2.007366156578064,
"reward_std": 0.4938293993473053,
"rewards/accuracy_reward": 0.20446429401636124,
"rewards/format_reward": 0.8839286208152771,
"rewards/tag_count_reward": 0.9189732581377029,
"step": 2240
},
{
"clip_ratio": 0.0,
"completion_length": 758.0080657958985,
"epoch": 0.7664077835623453,
"grad_norm": 3.440570592880249,
"kl": 1.26435546875,
"learning_rate": 4.714083652511686e-07,
"loss": 0.1752,
"reward": 1.99553582072258,
"reward_std": 0.4044483944773674,
"rewards/accuracy_reward": 0.1919642947614193,
"rewards/format_reward": 0.8848214656114578,
"rewards/tag_count_reward": 0.9187500447034835,
"step": 2245
},
{
"clip_ratio": 0.0,
"completion_length": 725.3080688476563,
"epoch": 0.7681147051292994,
"grad_norm": 2.006042242050171,
"kl": 1.3291015625,
"learning_rate": 4.6492069019655783e-07,
"loss": 0.1251,
"reward": 1.9899554550647736,
"reward_std": 0.4256874620914459,
"rewards/accuracy_reward": 0.16964286305010318,
"rewards/format_reward": 0.8937500417232513,
"rewards/tag_count_reward": 0.9265625447034835,
"step": 2250
},
{
"clip_ratio": 0.0,
"completion_length": 754.3616333007812,
"epoch": 0.7698216266962533,
"grad_norm": 1.4680705070495605,
"kl": 1.687109375,
"learning_rate": 4.5846977055305117e-07,
"loss": 0.2298,
"reward": 1.923214364051819,
"reward_std": 0.47255007922649384,
"rewards/accuracy_reward": 0.15446429289877414,
"rewards/format_reward": 0.8607143342494965,
"rewards/tag_count_reward": 0.9080357551574707,
"step": 2255
},
{
"clip_ratio": 0.0,
"completion_length": 766.7187866210937,
"epoch": 0.7715285482632073,
"grad_norm": 2.3678011894226074,
"kl": 1.8763671875,
"learning_rate": 4.5205583539121457e-07,
"loss": 0.2091,
"reward": 1.9611608386039734,
"reward_std": 0.4554700754582882,
"rewards/accuracy_reward": 0.16160715138539672,
"rewards/format_reward": 0.8812500447034836,
"rewards/tag_count_reward": 0.9183036118745804,
"step": 2260
},
{
"clip_ratio": 0.0,
"completion_length": 746.3866333007812,
"epoch": 0.7732354698301613,
"grad_norm": 3.2156410217285156,
"kl": 1.5232421875,
"learning_rate": 4.456791124683043e-07,
"loss": 0.1613,
"reward": 1.9868304550647735,
"reward_std": 0.4991786405444145,
"rewards/accuracy_reward": 0.19017858076840638,
"rewards/format_reward": 0.879464328289032,
"rewards/tag_count_reward": 0.9171875447034836,
"step": 2265
},
{
"clip_ratio": 0.0,
"completion_length": 729.1241424560546,
"epoch": 0.7749423913971153,
"grad_norm": 1.6405709981918335,
"kl": 1.175,
"learning_rate": 4.3933982822017883e-07,
"loss": 0.1704,
"reward": 1.9705358147621155,
"reward_std": 0.38443772345781324,
"rewards/accuracy_reward": 0.1714285803027451,
"rewards/format_reward": 0.8857143223285675,
"rewards/tag_count_reward": 0.9133929014205933,
"step": 2270
},
{
"clip_ratio": 0.0,
"completion_length": 736.1670013427735,
"epoch": 0.7766493129640692,
"grad_norm": 2.096561908721924,
"kl": 1.32587890625,
"learning_rate": 4.330382077532594e-07,
"loss": 0.1661,
"reward": 1.9901786684989928,
"reward_std": 0.4159936264157295,
"rewards/accuracy_reward": 0.17678572433069348,
"rewards/format_reward": 0.8892857521772385,
"rewards/tag_count_reward": 0.9241071790456772,
"step": 2275
},
{
"clip_ratio": 0.0,
"completion_length": 713.774136352539,
"epoch": 0.7783562345310233,
"grad_norm": 1.4854867458343506,
"kl": 1.338671875,
"learning_rate": 4.2677447483653544e-07,
"loss": 0.1976,
"reward": 2.014509016275406,
"reward_std": 0.3893769010901451,
"rewards/accuracy_reward": 0.1946428687311709,
"rewards/format_reward": 0.8955357551574707,
"rewards/tag_count_reward": 0.924330398440361,
"step": 2280
},
{
"clip_ratio": 0.0,
"completion_length": 744.8598602294921,
"epoch": 0.7800631560979773,
"grad_norm": 1.2633461952209473,
"kl": 1.43720703125,
"learning_rate": 4.2054885189361833e-07,
"loss": 0.2002,
"reward": 1.9819197416305543,
"reward_std": 0.4485273748636246,
"rewards/accuracy_reward": 0.17142857760190963,
"rewards/format_reward": 0.8857143342494964,
"rewards/tag_count_reward": 0.924776828289032,
"step": 2285
},
{
"clip_ratio": 0.0,
"completion_length": 748.2277160644531,
"epoch": 0.7817700776649313,
"grad_norm": 1.8592782020568848,
"kl": 1.3447265625,
"learning_rate": 4.143615599948437e-07,
"loss": 0.2039,
"reward": 1.9948661625385284,
"reward_std": 0.46121391505002973,
"rewards/accuracy_reward": 0.18482143487781286,
"rewards/format_reward": 0.8883929014205932,
"rewards/tag_count_reward": 0.9216518223285675,
"step": 2290
},
{
"clip_ratio": 0.0,
"completion_length": 755.8902099609375,
"epoch": 0.7834769992318853,
"grad_norm": 3.4845056533813477,
"kl": 1.9146484375,
"learning_rate": 4.0821281884942145e-07,
"loss": 0.2436,
"reward": 1.9580358266830444,
"reward_std": 0.48862814009189603,
"rewards/accuracy_reward": 0.17857143571600317,
"rewards/format_reward": 0.869642898440361,
"rewards/tag_count_reward": 0.9098214715719223,
"step": 2295
},
{
"clip_ratio": 0.0,
"completion_length": 750.5259246826172,
"epoch": 0.7851839207988393,
"grad_norm": 1.9733744859695435,
"kl": 1.48447265625,
"learning_rate": 4.021028467976341e-07,
"loss": 0.2299,
"reward": 1.9584822475910186,
"reward_std": 0.4035663403570652,
"rewards/accuracy_reward": 0.16071429373696447,
"rewards/format_reward": 0.8794643253087997,
"rewards/tag_count_reward": 0.918303620815277,
"step": 2300
},
{
"clip_ratio": 0.0,
"completion_length": 721.5187866210938,
"epoch": 0.7868908423657933,
"grad_norm": 4.208427429199219,
"kl": 1.501953125,
"learning_rate": 3.9603186080308253e-07,
"loss": 0.2006,
"reward": 2.011160784959793,
"reward_std": 0.47146844640374186,
"rewards/accuracy_reward": 0.20178572395816446,
"rewards/format_reward": 0.8857143312692642,
"rewards/tag_count_reward": 0.9236607551574707,
"step": 2305
},
{
"clip_ratio": 0.0,
"completion_length": 697.9473556518554,
"epoch": 0.7885977639327473,
"grad_norm": 1.42350435256958,
"kl": 1.84296875,
"learning_rate": 3.90000076444983e-07,
"loss": 0.2227,
"reward": 1.9866072297096253,
"reward_std": 0.4845219224691391,
"rewards/accuracy_reward": 0.19553572265431285,
"rewards/format_reward": 0.8750000417232513,
"rewards/tag_count_reward": 0.9160714626312256,
"step": 2310
},
{
"clip_ratio": 0.0,
"completion_length": 710.6911041259766,
"epoch": 0.7903046854997013,
"grad_norm": 1.7006930112838745,
"kl": 1.32939453125,
"learning_rate": 3.8400770791051087e-07,
"loss": 0.1753,
"reward": 1.9910715341567993,
"reward_std": 0.4488883726298809,
"rewards/accuracy_reward": 0.16785715138539672,
"rewards/format_reward": 0.8937500417232513,
"rewards/tag_count_reward": 0.929464328289032,
"step": 2315
},
{
"clip_ratio": 0.0,
"completion_length": 761.4884246826172,
"epoch": 0.7920116070666553,
"grad_norm": 1.214569091796875,
"kl": 1.682421875,
"learning_rate": 3.7805496798719545e-07,
"loss": 0.2685,
"reward": 1.9638393938541412,
"reward_std": 0.4939615406095982,
"rewards/accuracy_reward": 0.18035714877769352,
"rewards/format_reward": 0.8732143253087997,
"rewards/tag_count_reward": 0.9102679044008255,
"step": 2320
},
{
"clip_ratio": 0.0,
"completion_length": 724.4518188476562,
"epoch": 0.7937185286336093,
"grad_norm": 3.557129383087158,
"kl": 1.6748046875,
"learning_rate": 3.721420680553634e-07,
"loss": 0.1957,
"reward": 1.9421875953674317,
"reward_std": 0.4420395828783512,
"rewards/accuracy_reward": 0.15625000586733223,
"rewards/format_reward": 0.8741071879863739,
"rewards/tag_count_reward": 0.9118303954601288,
"step": 2325
},
{
"clip_ratio": 0.0,
"completion_length": 725.4232543945312,
"epoch": 0.7954254502005633,
"grad_norm": 2.902493476867676,
"kl": 1.6931640625,
"learning_rate": 3.6626921808063434e-07,
"loss": 0.2131,
"reward": 1.9752233147621154,
"reward_std": 0.48989410772919656,
"rewards/accuracy_reward": 0.1910714352503419,
"rewards/format_reward": 0.8741071850061417,
"rewards/tag_count_reward": 0.910044687986374,
"step": 2330
},
{
"clip_ratio": 0.0,
"completion_length": 760.6036010742188,
"epoch": 0.7971323717675173,
"grad_norm": 1.2533881664276123,
"kl": 1.611328125,
"learning_rate": 3.604366266064625e-07,
"loss": 0.2517,
"reward": 1.9265625774860382,
"reward_std": 0.49659521505236626,
"rewards/accuracy_reward": 0.16696429457515477,
"rewards/format_reward": 0.8598214685916901,
"rewards/tag_count_reward": 0.8997768223285675,
"step": 2335
},
{
"clip_ratio": 0.0,
"completion_length": 689.780386352539,
"epoch": 0.7988392933344712,
"grad_norm": 1.823298454284668,
"kl": 1.47578125,
"learning_rate": 3.546445007467333e-07,
"loss": 0.2197,
"reward": 1.9589286863803863,
"reward_std": 0.45438055247068404,
"rewards/accuracy_reward": 0.17678572349250316,
"rewards/format_reward": 0.8741071939468383,
"rewards/tag_count_reward": 0.908035758137703,
"step": 2340
},
{
"clip_ratio": 0.0,
"completion_length": 760.0036041259766,
"epoch": 0.8005462149014253,
"grad_norm": 1.8940644264221191,
"kl": 1.762109375,
"learning_rate": 3.488930461784075e-07,
"loss": 0.2641,
"reward": 1.8991072356700898,
"reward_std": 0.4983711659908295,
"rewards/accuracy_reward": 0.1392857225611806,
"rewards/format_reward": 0.8589286118745804,
"rewards/tag_count_reward": 0.9008928954601287,
"step": 2345
},
{
"clip_ratio": 0.0,
"completion_length": 718.2661041259765,
"epoch": 0.8022531364683793,
"grad_norm": 5.181538105010986,
"kl": 1.598828125,
"learning_rate": 3.431824671342198e-07,
"loss": 0.1734,
"reward": 1.9828125834465027,
"reward_std": 0.47254706025123594,
"rewards/accuracy_reward": 0.19107143431901932,
"rewards/format_reward": 0.8776786148548126,
"rewards/tag_count_reward": 0.9140625417232513,
"step": 2350
},
{
"clip_ratio": 0.0,
"completion_length": 737.5125274658203,
"epoch": 0.8039600580353333,
"grad_norm": 2.329116106033325,
"kl": 1.43359375,
"learning_rate": 3.375129663954233e-07,
"loss": 0.1932,
"reward": 1.9479911565780639,
"reward_std": 0.4480812445282936,
"rewards/accuracy_reward": 0.16785715129226447,
"rewards/format_reward": 0.8732143312692642,
"rewards/tag_count_reward": 0.9069196879863739,
"step": 2355
},
{
"clip_ratio": 0.0,
"completion_length": 739.2071746826172,
"epoch": 0.8056669796022873,
"grad_norm": 0.8552534580230713,
"kl": 1.71162109375,
"learning_rate": 3.318847452845922e-07,
"loss": 0.2293,
"reward": 1.9145090103149414,
"reward_std": 0.5126624539494514,
"rewards/accuracy_reward": 0.16339286379516124,
"rewards/format_reward": 0.8553571850061417,
"rewards/tag_count_reward": 0.8957589745521546,
"step": 2360
},
{
"clip_ratio": 0.0,
"completion_length": 717.5143218994141,
"epoch": 0.8073739011692412,
"grad_norm": 2.5750255584716797,
"kl": 1.56728515625,
"learning_rate": 3.2629800365847046e-07,
"loss": 0.2652,
"reward": 1.9863840222358704,
"reward_std": 0.4886273622512817,
"rewards/accuracy_reward": 0.20892858020961286,
"rewards/format_reward": 0.869642898440361,
"rewards/tag_count_reward": 0.9078125387430191,
"step": 2365
},
{
"clip_ratio": 0.0,
"completion_length": 738.2857482910156,
"epoch": 0.8090808227361953,
"grad_norm": 4.131805896759033,
"kl": 1.611328125,
"learning_rate": 3.207529399008756e-07,
"loss": 0.2404,
"reward": 1.9517858147621154,
"reward_std": 0.520108437538147,
"rewards/accuracy_reward": 0.169642863702029,
"rewards/format_reward": 0.8741071850061417,
"rewards/tag_count_reward": 0.908035758137703,
"step": 2370
},
{
"clip_ratio": 0.0,
"completion_length": 752.1777130126953,
"epoch": 0.8107877443031493,
"grad_norm": 1.5850900411605835,
"kl": 1.9701171875,
"learning_rate": 3.152497509156543e-07,
"loss": 0.2684,
"reward": 1.891071516275406,
"reward_std": 0.49450200498104097,
"rewards/accuracy_reward": 0.15000000754371284,
"rewards/format_reward": 0.8517857640981674,
"rewards/tag_count_reward": 0.8892857521772385,
"step": 2375
},
{
"clip_ratio": 0.0,
"completion_length": 719.3598571777344,
"epoch": 0.8124946658701032,
"grad_norm": 2.113619804382324,
"kl": 1.4361328125,
"learning_rate": 3.0978863211969146e-07,
"loss": 0.1825,
"reward": 2.00491082072258,
"reward_std": 0.41534021943807603,
"rewards/accuracy_reward": 0.1901785798370838,
"rewards/format_reward": 0.8910714685916901,
"rewards/tag_count_reward": 0.9236607611179352,
"step": 2380
},
{
"clip_ratio": 0.0,
"completion_length": 725.2830688476563,
"epoch": 0.8142015874370573,
"grad_norm": 47.90665817260742,
"kl": 1.62421875,
"learning_rate": 3.0436977743596823e-07,
"loss": 0.1664,
"reward": 1.945312601327896,
"reward_std": 0.4939949780702591,
"rewards/accuracy_reward": 0.17678572284057736,
"rewards/format_reward": 0.86607146859169,
"rewards/tag_count_reward": 0.9024554044008255,
"step": 2385
},
{
"clip_ratio": 0.0,
"completion_length": 739.9491424560547,
"epoch": 0.8159085090040112,
"grad_norm": 2.70536470413208,
"kl": 1.4115234375,
"learning_rate": 2.989933792866793e-07,
"loss": 0.1852,
"reward": 1.9897322237491608,
"reward_std": 0.47392349503934383,
"rewards/accuracy_reward": 0.2017857247032225,
"rewards/format_reward": 0.8750000476837159,
"rewards/tag_count_reward": 0.91294646859169,
"step": 2390
},
{
"clip_ratio": 0.0,
"completion_length": 759.7696807861328,
"epoch": 0.8176154305709653,
"grad_norm": 1.9668916463851929,
"kl": 1.85087890625,
"learning_rate": 2.9365962858639733e-07,
"loss": 0.3048,
"reward": 1.9325893759727477,
"reward_std": 0.5062482297420502,
"rewards/accuracy_reward": 0.17857143562287092,
"rewards/format_reward": 0.8544643253087998,
"rewards/tag_count_reward": 0.8995536118745804,
"step": 2395
},
{
"clip_ratio": 0.0,
"completion_length": 748.3286041259765,
"epoch": 0.8193223521379193,
"grad_norm": 5.197607040405273,
"kl": 1.83046875,
"learning_rate": 2.8836871473529435e-07,
"loss": 0.2788,
"reward": 1.923660808801651,
"reward_std": 0.5111315354704857,
"rewards/accuracy_reward": 0.1714285772293806,
"rewards/format_reward": 0.8562500387430191,
"rewards/tag_count_reward": 0.8959821909666061,
"step": 2400
},
{
"clip_ratio": 0.0,
"completion_length": 733.7250335693359,
"epoch": 0.8210292737048732,
"grad_norm": 3.3527982234954834,
"kl": 1.71171875,
"learning_rate": 2.831208256124167e-07,
"loss": 0.2245,
"reward": 1.8738840103149415,
"reward_std": 0.5179990664124489,
"rewards/accuracy_reward": 0.15089286230504512,
"rewards/format_reward": 0.8410714715719223,
"rewards/tag_count_reward": 0.8819196879863739,
"step": 2405
},
{
"clip_ratio": 0.0,
"completion_length": 720.9964660644531,
"epoch": 0.8227361952718273,
"grad_norm": 1.7930233478546143,
"kl": 1.6314453125,
"learning_rate": 2.779161475690135e-07,
"loss": 0.1954,
"reward": 1.8834822058677674,
"reward_std": 0.47747668251395226,
"rewards/accuracy_reward": 0.14285715008154512,
"rewards/format_reward": 0.8517857611179351,
"rewards/tag_count_reward": 0.8888393253087997,
"step": 2410
},
{
"clip_ratio": 0.0,
"completion_length": 720.8866363525391,
"epoch": 0.8244431168387812,
"grad_norm": 3.0464282035827637,
"kl": 1.894921875,
"learning_rate": 2.727548654219193e-07,
"loss": 0.2449,
"reward": 1.9600447237491607,
"reward_std": 0.531683550029993,
"rewards/accuracy_reward": 0.20982143972069026,
"rewards/format_reward": 0.8553571850061417,
"rewards/tag_count_reward": 0.8948661118745804,
"step": 2415
},
{
"clip_ratio": 0.0,
"completion_length": 717.7839569091797,
"epoch": 0.8261500384057353,
"grad_norm": 2.702840805053711,
"kl": 1.90859375,
"learning_rate": 2.6763716244699057e-07,
"loss": 0.2345,
"reward": 1.8950893878936768,
"reward_std": 0.5322792515158653,
"rewards/accuracy_reward": 0.1482142921537161,
"rewards/format_reward": 0.8535714685916901,
"rewards/tag_count_reward": 0.8933036118745804,
"step": 2420
},
{
"clip_ratio": 0.0,
"completion_length": 749.0768280029297,
"epoch": 0.8278569599726893,
"grad_norm": 1.9960432052612305,
"kl": 1.976953125,
"learning_rate": 2.625632203725979e-07,
"loss": 0.2261,
"reward": 1.9006697237491608,
"reward_std": 0.5312141239643097,
"rewards/accuracy_reward": 0.16607143729925156,
"rewards/format_reward": 0.8464286178350449,
"rewards/tag_count_reward": 0.8881696850061417,
"step": 2425
},
{
"clip_ratio": 0.0,
"completion_length": 729.7393218994141,
"epoch": 0.8295638815396432,
"grad_norm": 2.3977620601654053,
"kl": 1.990625,
"learning_rate": 2.575332193731732e-07,
"loss": 0.2026,
"reward": 1.9734375774860382,
"reward_std": 0.5333075791597366,
"rewards/accuracy_reward": 0.20714286817237734,
"rewards/format_reward": 0.8633929014205932,
"rewards/tag_count_reward": 0.9029018253087997,
"step": 2430
},
{
"clip_ratio": 0.0,
"completion_length": 722.9384246826172,
"epoch": 0.8312708031065973,
"grad_norm": 1.5522792339324951,
"kl": 1.5064453125,
"learning_rate": 2.525473380628127e-07,
"loss": 0.1976,
"reward": 1.9145089983940125,
"reward_std": 0.4954484052956104,
"rewards/accuracy_reward": 0.17946429280564188,
"rewards/format_reward": 0.8464286088943481,
"rewards/tag_count_reward": 0.8886161059141159,
"step": 2435
},
{
"clip_ratio": 0.0,
"completion_length": 731.9803863525391,
"epoch": 0.8329777246735512,
"grad_norm": 1.8127202987670898,
"kl": 1.65087890625,
"learning_rate": 2.4760575348893164e-07,
"loss": 0.2192,
"reward": 1.9658482909202575,
"reward_std": 0.4845112472772598,
"rewards/accuracy_reward": 0.17946429476141929,
"rewards/format_reward": 0.876785758137703,
"rewards/tag_count_reward": 0.9095982581377029,
"step": 2440
},
{
"clip_ratio": 0.0,
"completion_length": 703.525033569336,
"epoch": 0.8346846462405052,
"grad_norm": 1.0752168893814087,
"kl": 1.583203125,
"learning_rate": 2.427086411259812e-07,
"loss": 0.2127,
"reward": 1.9837054550647735,
"reward_std": 0.4994090169668198,
"rewards/accuracy_reward": 0.20625000940635801,
"rewards/format_reward": 0.8696429014205933,
"rewards/tag_count_reward": 0.9078125357627869,
"step": 2445
},
{
"clip_ratio": 0.0,
"completion_length": 732.0089660644531,
"epoch": 0.8363915678074593,
"grad_norm": 1.7221641540527344,
"kl": 1.6380859375,
"learning_rate": 2.378561748692124e-07,
"loss": 0.2077,
"reward": 1.9569197177886963,
"reward_std": 0.47989270444959403,
"rewards/accuracy_reward": 0.19285715287551283,
"rewards/format_reward": 0.86607146859169,
"rewards/tag_count_reward": 0.8979911088943482,
"step": 2450
},
{
"clip_ratio": 0.0,
"completion_length": 703.2803955078125,
"epoch": 0.8380984893744132,
"grad_norm": 2.0460386276245117,
"kl": 1.30048828125,
"learning_rate": 2.3304852702850688e-07,
"loss": 0.1829,
"reward": 2.0325893819332124,
"reward_std": 0.44422818124294283,
"rewards/accuracy_reward": 0.22678572358563542,
"rewards/format_reward": 0.8875000387430191,
"rewards/tag_count_reward": 0.9183036088943481,
"step": 2455
},
{
"clip_ratio": 0.0,
"completion_length": 702.3446746826172,
"epoch": 0.8398054109413673,
"grad_norm": 1.9257980585098267,
"kl": 1.203515625,
"learning_rate": 2.282858683222535e-07,
"loss": 0.185,
"reward": 2.0203125894069673,
"reward_std": 0.45232805162668227,
"rewards/accuracy_reward": 0.2116071492433548,
"rewards/format_reward": 0.8883928984403611,
"rewards/tag_count_reward": 0.920312550663948,
"step": 2460
},
{
"clip_ratio": 0.0,
"completion_length": 694.3973571777344,
"epoch": 0.8415123325083212,
"grad_norm": 2.9717674255371094,
"kl": 1.59736328125,
"learning_rate": 2.2356836787128947e-07,
"loss": 0.2189,
"reward": 1.9305804431438447,
"reward_std": 0.43135242685675623,
"rewards/accuracy_reward": 0.17589286426082254,
"rewards/format_reward": 0.8580357551574707,
"rewards/tag_count_reward": 0.896651828289032,
"step": 2465
},
{
"clip_ratio": 0.0,
"completion_length": 714.9027130126954,
"epoch": 0.8432192540752752,
"grad_norm": 0.7697265148162842,
"kl": 1.35322265625,
"learning_rate": 2.188961931928925e-07,
"loss": 0.1832,
"reward": 2.0316965222358703,
"reward_std": 0.45708170533180237,
"rewards/accuracy_reward": 0.21428572311997413,
"rewards/format_reward": 0.8910714745521545,
"rewards/tag_count_reward": 0.9263393253087997,
"step": 2470
},
{
"clip_ratio": 0.0,
"completion_length": 677.9036041259766,
"epoch": 0.8449261756422293,
"grad_norm": 2.200136661529541,
"kl": 1.51708984375,
"learning_rate": 2.1426951019483327e-07,
"loss": 0.197,
"reward": 1.9850447237491609,
"reward_std": 0.43524651750922205,
"rewards/accuracy_reward": 0.17232143571600317,
"rewards/format_reward": 0.8883928984403611,
"rewards/tag_count_reward": 0.924330398440361,
"step": 2475
},
{
"clip_ratio": 0.0,
"completion_length": 729.7661071777344,
"epoch": 0.8466330972091832,
"grad_norm": 2.73653507232666,
"kl": 1.453515625,
"learning_rate": 2.0968848316948414e-07,
"loss": 0.1622,
"reward": 1.9546875834465027,
"reward_std": 0.47535726577043536,
"rewards/accuracy_reward": 0.19017857676371933,
"rewards/format_reward": 0.8625000387430191,
"rewards/tag_count_reward": 0.9020089656114578,
"step": 2480
},
{
"clip_ratio": 0.0,
"completion_length": 731.2491394042969,
"epoch": 0.8483400187761372,
"grad_norm": 2.792485475540161,
"kl": 1.142578125,
"learning_rate": 2.0515327478798601e-07,
"loss": 0.1639,
"reward": 1.9897322356700897,
"reward_std": 0.4151985734701157,
"rewards/accuracy_reward": 0.1767857219092548,
"rewards/format_reward": 0.8910714656114578,
"rewards/tag_count_reward": 0.9218750387430191,
"step": 2485
},
{
"clip_ratio": 0.0,
"completion_length": 712.3964569091797,
"epoch": 0.8500469403430913,
"grad_norm": 2.1750404834747314,
"kl": 1.362890625,
"learning_rate": 2.006640460944701e-07,
"loss": 0.2054,
"reward": 2.036384052038193,
"reward_std": 0.4850167170166969,
"rewards/accuracy_reward": 0.24107143646106124,
"rewards/format_reward": 0.8794643312692643,
"rewards/tag_count_reward": 0.9158482551574707,
"step": 2490
},
{
"clip_ratio": 0.0,
"completion_length": 713.9687805175781,
"epoch": 0.8517538619100452,
"grad_norm": 2.2476091384887695,
"kl": 1.6875244140625,
"learning_rate": 1.9622095650034077e-07,
"loss": 0.2528,
"reward": 2.035267961025238,
"reward_std": 0.5042560985311866,
"rewards/accuracy_reward": 0.24553572237491608,
"rewards/format_reward": 0.8767857521772384,
"rewards/tag_count_reward": 0.9129464715719223,
"step": 2495
},
{
"clip_ratio": 0.0,
"completion_length": 720.7411010742187,
"epoch": 0.8534607834769993,
"grad_norm": 2.9233672618865967,
"kl": 1.546484375,
"learning_rate": 1.9182416377861388e-07,
"loss": 0.2024,
"reward": 1.935491156578064,
"reward_std": 0.4808127790689468,
"rewards/accuracy_reward": 0.1607142912223935,
"rewards/format_reward": 0.866964328289032,
"rewards/tag_count_reward": 0.9078125476837158,
"step": 2500
},
{
"clip_ratio": 0.0,
"completion_length": 708.3143157958984,
"epoch": 0.8551677050439532,
"grad_norm": 3.2671101093292236,
"kl": 1.0187744140625,
"learning_rate": 1.8747382405831515e-07,
"loss": 0.1549,
"reward": 1.9649554669857026,
"reward_std": 0.3683423440903425,
"rewards/accuracy_reward": 0.1535714365541935,
"rewards/format_reward": 0.8919643282890319,
"rewards/tag_count_reward": 0.9194196850061417,
"step": 2505
},
{
"clip_ratio": 0.0,
"completion_length": 707.5116363525391,
"epoch": 0.8568746266109072,
"grad_norm": 1.725104808807373,
"kl": 1.13115234375,
"learning_rate": 1.8317009181893507e-07,
"loss": 0.1587,
"reward": 1.9776786625385285,
"reward_std": 0.4078820027410984,
"rewards/accuracy_reward": 0.17053572311997414,
"rewards/format_reward": 0.8883929014205932,
"rewards/tag_count_reward": 0.9187500447034835,
"step": 2510
},
{
"clip_ratio": 0.0,
"completion_length": 726.5812835693359,
"epoch": 0.8585815481778613,
"grad_norm": 0.9512146711349487,
"kl": 1.148876953125,
"learning_rate": 1.7891311988494523e-07,
"loss": 0.1464,
"reward": 2.0390625953674317,
"reward_std": 0.40070234164595603,
"rewards/accuracy_reward": 0.2053571523167193,
"rewards/format_reward": 0.904464328289032,
"rewards/tag_count_reward": 0.9292411148548126,
"step": 2515
},
{
"clip_ratio": 0.0,
"completion_length": 708.7821838378907,
"epoch": 0.8602884697448152,
"grad_norm": 1.6017332077026367,
"kl": 1.38046875,
"learning_rate": 1.7470305942036864e-07,
"loss": 0.163,
"reward": 2.030134028196335,
"reward_std": 0.4416078761219978,
"rewards/accuracy_reward": 0.20982143776491285,
"rewards/format_reward": 0.8937500447034836,
"rewards/tag_count_reward": 0.9265625476837158,
"step": 2520
},
{
"clip_ratio": 0.0,
"completion_length": 694.2955627441406,
"epoch": 0.8619953913117693,
"grad_norm": 1.2069106101989746,
"kl": 1.48408203125,
"learning_rate": 1.705400599234152e-07,
"loss": 0.2074,
"reward": 2.0006697118282317,
"reward_std": 0.48490975946187975,
"rewards/accuracy_reward": 0.22053572302684188,
"rewards/format_reward": 0.8741071850061417,
"rewards/tag_count_reward": 0.9060268253087997,
"step": 2525
},
{
"clip_ratio": 0.0,
"completion_length": 695.4312896728516,
"epoch": 0.8637023128787232,
"grad_norm": 1.0792300701141357,
"kl": 1.41123046875,
"learning_rate": 1.6642426922117037e-07,
"loss": 0.1589,
"reward": 1.9928572356700898,
"reward_std": 0.4516665853559971,
"rewards/accuracy_reward": 0.17232143692672253,
"rewards/format_reward": 0.8955357521772385,
"rewards/tag_count_reward": 0.9250000417232513,
"step": 2530
},
{
"clip_ratio": 0.0,
"completion_length": 724.9500274658203,
"epoch": 0.8654092344456772,
"grad_norm": 1.6458358764648438,
"kl": 1.53515625,
"learning_rate": 1.62355833464347e-07,
"loss": 0.1851,
"reward": 1.9680804491043091,
"reward_std": 0.4724808134138584,
"rewards/accuracy_reward": 0.18125000586733223,
"rewards/format_reward": 0.876785758137703,
"rewards/tag_count_reward": 0.9100446850061417,
"step": 2535
},
{
"clip_ratio": 0.0,
"completion_length": 678.3027099609375,
"epoch": 0.8671161560126313,
"grad_norm": 2.028154134750366,
"kl": 1.37314453125,
"learning_rate": 1.5833489712209643e-07,
"loss": 0.1602,
"reward": 1.980580449104309,
"reward_std": 0.47636549547314644,
"rewards/accuracy_reward": 0.20178572311997414,
"rewards/format_reward": 0.8705357521772384,
"rewards/tag_count_reward": 0.9082589656114578,
"step": 2540
},
{
"clip_ratio": 0.0,
"completion_length": 723.8598449707031,
"epoch": 0.8688230775795852,
"grad_norm": 1.2274961471557617,
"kl": 1.5658203125,
"learning_rate": 1.5436160297687614e-07,
"loss": 0.1623,
"reward": 1.9558036625385284,
"reward_std": 0.47571387365460394,
"rewards/accuracy_reward": 0.16071429289877415,
"rewards/format_reward": 0.879464328289032,
"rewards/tag_count_reward": 0.9156250357627869,
"step": 2545
},
{
"clip_ratio": 0.0,
"completion_length": 715.9285949707031,
"epoch": 0.8705299991465392,
"grad_norm": 1.1409246921539307,
"kl": 1.7728515625,
"learning_rate": 1.5043609211938257e-07,
"loss": 0.1836,
"reward": 1.9303572237491609,
"reward_std": 0.5227501168847084,
"rewards/accuracy_reward": 0.16607143748551606,
"rewards/format_reward": 0.8642857611179352,
"rewards/tag_count_reward": 0.9000000476837158,
"step": 2550
},
{
"clip_ratio": 0.0,
"completion_length": 741.2357452392578,
"epoch": 0.8722369207134932,
"grad_norm": 2.0067524909973145,
"kl": 1.790625,
"learning_rate": 1.4655850394353738e-07,
"loss": 0.2562,
"reward": 1.9258929550647736,
"reward_std": 0.5418844744563103,
"rewards/accuracy_reward": 0.16250000735744835,
"rewards/format_reward": 0.8625000387430191,
"rewards/tag_count_reward": 0.9008929014205933,
"step": 2555
},
{
"clip_ratio": 0.0,
"completion_length": 696.0223449707031,
"epoch": 0.8739438422804472,
"grad_norm": 3.1993730068206787,
"kl": 1.7654296875,
"learning_rate": 1.4272897614154161e-07,
"loss": 0.2202,
"reward": 1.9792411744594574,
"reward_std": 0.5070497654378414,
"rewards/accuracy_reward": 0.20714286770671606,
"rewards/format_reward": 0.86607146859169,
"rewards/tag_count_reward": 0.9060268193483353,
"step": 2560
},
{
"clip_ratio": 0.0,
"completion_length": 696.559848022461,
"epoch": 0.8756507638474013,
"grad_norm": 2.2070538997650146,
"kl": 1.67294921875,
"learning_rate": 1.389476446989828e-07,
"loss": 0.2129,
"reward": 1.979910808801651,
"reward_std": 0.5403969317674637,
"rewards/accuracy_reward": 0.21160715445876122,
"rewards/format_reward": 0.8625000387430191,
"rewards/tag_count_reward": 0.9058036118745804,
"step": 2565
},
{
"clip_ratio": 0.0,
"completion_length": 729.2357513427735,
"epoch": 0.8773576854143552,
"grad_norm": 1.5151607990264893,
"kl": 1.8220703125,
"learning_rate": 1.3521464389000853e-07,
"loss": 0.2421,
"reward": 1.950446504354477,
"reward_std": 0.5206233039498329,
"rewards/accuracy_reward": 0.18750000931322575,
"rewards/format_reward": 0.8598214685916901,
"rewards/tag_count_reward": 0.9031250447034835,
"step": 2570
},
{
"clip_ratio": 0.0,
"completion_length": 736.9411010742188,
"epoch": 0.8790646069813092,
"grad_norm": 3.210714101791382,
"kl": 1.8384765625,
"learning_rate": 1.3153010627255728e-07,
"loss": 0.2388,
"reward": 1.8953125953674317,
"reward_std": 0.522989672422409,
"rewards/accuracy_reward": 0.16071429401636123,
"rewards/format_reward": 0.8455357521772384,
"rewards/tag_count_reward": 0.8890625387430191,
"step": 2575
},
{
"clip_ratio": 0.0,
"completion_length": 744.8303863525391,
"epoch": 0.8807715285482632,
"grad_norm": 2.01275897026062,
"kl": 2.0505859375,
"learning_rate": 1.2789416268365146e-07,
"loss": 0.2423,
"reward": 1.9046875894069673,
"reward_std": 0.588728591799736,
"rewards/accuracy_reward": 0.17857143739238382,
"rewards/format_reward": 0.8410714775323868,
"rewards/tag_count_reward": 0.8850446820259095,
"step": 2580
},
{
"clip_ratio": 0.0,
"completion_length": 711.4723541259766,
"epoch": 0.8824784501152172,
"grad_norm": 1.6076045036315918,
"kl": 1.7802734375,
"learning_rate": 1.2430694223475087e-07,
"loss": 0.2169,
"reward": 1.8776786565780639,
"reward_std": 0.5265608415007591,
"rewards/accuracy_reward": 0.1544642928056419,
"rewards/format_reward": 0.8366071879863739,
"rewards/tag_count_reward": 0.8866071820259094,
"step": 2585
},
{
"clip_ratio": 0.0,
"completion_length": 748.5223541259766,
"epoch": 0.8841853716821712,
"grad_norm": 2.9953057765960693,
"kl": 1.766796875,
"learning_rate": 1.2076857230717004e-07,
"loss": 0.2396,
"reward": 1.876339364051819,
"reward_std": 0.5042637214064598,
"rewards/accuracy_reward": 0.1357142912223935,
"rewards/format_reward": 0.8473214656114578,
"rewards/tag_count_reward": 0.8933036178350449,
"step": 2590
},
{
"clip_ratio": 0.0,
"completion_length": 685.8982513427734,
"epoch": 0.8858922932491252,
"grad_norm": 1.664928913116455,
"kl": 1.6484375,
"learning_rate": 1.1727917854755238e-07,
"loss": 0.1894,
"reward": 1.946428656578064,
"reward_std": 0.5181833237409592,
"rewards/accuracy_reward": 0.20089286677539347,
"rewards/format_reward": 0.8500000447034836,
"rewards/tag_count_reward": 0.8955357581377029,
"step": 2595
},
{
"clip_ratio": 0.0,
"completion_length": 738.9411041259766,
"epoch": 0.8875992148160792,
"grad_norm": 1.6388742923736572,
"kl": 1.8392578125,
"learning_rate": 1.1383888486341032e-07,
"loss": 0.2812,
"reward": 1.861160808801651,
"reward_std": 0.5453050881624222,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.835714328289032,
"rewards/tag_count_reward": 0.8825893372297287,
"step": 2600
},
{
"clip_ratio": 0.0,
"completion_length": 692.9500396728515,
"epoch": 0.8893061363830332,
"grad_norm": 1.3716119527816772,
"kl": 1.69130859375,
"learning_rate": 1.1044781341872411e-07,
"loss": 0.2436,
"reward": 1.9834822297096253,
"reward_std": 0.5287271916866303,
"rewards/accuracy_reward": 0.23928572330623865,
"rewards/format_reward": 0.8526786148548127,
"rewards/tag_count_reward": 0.8915178954601288,
"step": 2605
},
{
"clip_ratio": 0.0,
"completion_length": 745.263427734375,
"epoch": 0.8910130579499872,
"grad_norm": 1.7265480756759644,
"kl": 1.85703125,
"learning_rate": 1.0710608462960486e-07,
"loss": 0.2463,
"reward": 1.9129464983940125,
"reward_std": 0.5115076020359993,
"rewards/accuracy_reward": 0.16428571874275805,
"rewards/format_reward": 0.8526786118745804,
"rewards/tag_count_reward": 0.8959821909666061,
"step": 2610
},
{
"clip_ratio": 0.0,
"completion_length": 706.6116363525391,
"epoch": 0.8927199795169412,
"grad_norm": 5.472092151641846,
"kl": 1.733984375,
"learning_rate": 1.038138171600177e-07,
"loss": 0.2281,
"reward": 1.9308036863803864,
"reward_std": 0.4873148113489151,
"rewards/accuracy_reward": 0.16517857909202577,
"rewards/format_reward": 0.8642857491970062,
"rewards/tag_count_reward": 0.9013393223285675,
"step": 2615
},
{
"clip_ratio": 0.0,
"completion_length": 754.4884185791016,
"epoch": 0.8944269010838952,
"grad_norm": 1.2924634218215942,
"kl": 1.869921875,
"learning_rate": 1.005711279175694e-07,
"loss": 0.2641,
"reward": 1.903348296880722,
"reward_std": 0.5570731215178967,
"rewards/accuracy_reward": 0.16785714756697417,
"rewards/format_reward": 0.845535758137703,
"rewards/tag_count_reward": 0.8899553954601288,
"step": 2620
},
{
"clip_ratio": 0.0,
"completion_length": 684.0527130126953,
"epoch": 0.8961338226508492,
"grad_norm": 2.2256343364715576,
"kl": 1.68251953125,
"learning_rate": 9.737813204935497e-08,
"loss": 0.2139,
"reward": 1.90401793718338,
"reward_std": 0.5087300404906273,
"rewards/accuracy_reward": 0.16607143655419349,
"rewards/format_reward": 0.8500000387430191,
"rewards/tag_count_reward": 0.8879464656114578,
"step": 2625
},
{
"clip_ratio": 0.0,
"completion_length": 731.5366394042969,
"epoch": 0.8978407442178032,
"grad_norm": 2.7418696880340576,
"kl": 1.9642578125,
"learning_rate": 9.423494293787082e-08,
"loss": 0.2427,
"reward": 1.8968750774860381,
"reward_std": 0.5185163721442223,
"rewards/accuracy_reward": 0.16071429187431932,
"rewards/format_reward": 0.8455357551574707,
"rewards/tag_count_reward": 0.8906250387430191,
"step": 2630
},
{
"clip_ratio": 0.0,
"completion_length": 667.6437774658203,
"epoch": 0.8995476657847572,
"grad_norm": 2.741389274597168,
"kl": 1.519921875,
"learning_rate": 9.114167219698744e-08,
"loss": 0.1709,
"reward": 1.954017961025238,
"reward_std": 0.464486388117075,
"rewards/accuracy_reward": 0.1812500087544322,
"rewards/format_reward": 0.8678571850061416,
"rewards/tag_count_reward": 0.9049107581377029,
"step": 2635
},
{
"clip_ratio": 0.0,
"completion_length": 724.1705718994141,
"epoch": 0.9012545873517112,
"grad_norm": 1.2222126722335815,
"kl": 1.3740234375,
"learning_rate": 8.809842966798587e-08,
"loss": 0.2209,
"reward": 1.959598284959793,
"reward_std": 0.5034348502755165,
"rewards/accuracy_reward": 0.208035721629858,
"rewards/format_reward": 0.8562500476837158,
"rewards/tag_count_reward": 0.8953125417232514,
"step": 2640
},
{
"clip_ratio": 0.0,
"completion_length": 710.2991378784179,
"epoch": 0.9029615089186652,
"grad_norm": 0.7933781147003174,
"kl": 1.36630859375,
"learning_rate": 8.510532341565807e-08,
"loss": 0.1944,
"reward": 1.9640626072883607,
"reward_std": 0.4950651377439499,
"rewards/accuracy_reward": 0.20982143916189672,
"rewards/format_reward": 0.8562500387430191,
"rewards/tag_count_reward": 0.8979911118745804,
"step": 2645
},
{
"clip_ratio": 0.0,
"completion_length": 701.2223571777344,
"epoch": 0.9046684304856192,
"grad_norm": 5.042927265167236,
"kl": 1.5912109375,
"learning_rate": 8.216245972446962e-08,
"loss": 0.1986,
"reward": 1.9100447297096252,
"reward_std": 0.49433635324239733,
"rewards/accuracy_reward": 0.15357143441215157,
"rewards/format_reward": 0.8571428984403611,
"rewards/tag_count_reward": 0.8993303954601288,
"step": 2650
},
{
"clip_ratio": 0.0,
"completion_length": 705.6312866210938,
"epoch": 0.9063753520525731,
"grad_norm": 1.833943486213684,
"kl": 1.58671875,
"learning_rate": 7.926994309478403e-08,
"loss": 0.2351,
"reward": 1.9138393700122833,
"reward_std": 0.5347620368003845,
"rewards/accuracy_reward": 0.16875000894069672,
"rewards/format_reward": 0.8500000417232514,
"rewards/tag_count_reward": 0.8950893223285675,
"step": 2655
},
{
"clip_ratio": 0.0,
"completion_length": 691.6018188476562,
"epoch": 0.9080822736195272,
"grad_norm": 1.80747389793396,
"kl": 1.40380859375,
"learning_rate": 7.642787623915442e-08,
"loss": 0.1708,
"reward": 1.991517972946167,
"reward_std": 0.4647905558347702,
"rewards/accuracy_reward": 0.19464286686852575,
"rewards/format_reward": 0.8812500417232514,
"rewards/tag_count_reward": 0.9156250387430191,
"step": 2660
},
{
"clip_ratio": 0.0,
"completion_length": 683.5330688476563,
"epoch": 0.9097891951864812,
"grad_norm": 2.021982192993164,
"kl": 1.7751953125,
"learning_rate": 7.36363600786733e-08,
"loss": 0.2256,
"reward": 1.9256697237491607,
"reward_std": 0.5254194289445877,
"rewards/accuracy_reward": 0.18482143618166447,
"rewards/format_reward": 0.8491071820259094,
"rewards/tag_count_reward": 0.8917411029338836,
"step": 2665
},
{
"clip_ratio": 0.0,
"completion_length": 707.2678833007812,
"epoch": 0.9114961167534352,
"grad_norm": 1.5858349800109863,
"kl": 1.50009765625,
"learning_rate": 7.089549373939186e-08,
"loss": 0.2216,
"reward": 1.9671875894069673,
"reward_std": 0.495637346804142,
"rewards/accuracy_reward": 0.18392857955768704,
"rewards/format_reward": 0.8741071879863739,
"rewards/tag_count_reward": 0.9091518342494964,
"step": 2670
},
{
"clip_ratio": 0.0,
"completion_length": 706.431283569336,
"epoch": 0.9132030383203892,
"grad_norm": 0.9021114110946655,
"kl": 1.64638671875,
"learning_rate": 6.8205374548798e-08,
"loss": 0.196,
"reward": 1.9872768700122834,
"reward_std": 0.4923027902841568,
"rewards/accuracy_reward": 0.2142857253551483,
"rewards/format_reward": 0.869642898440361,
"rewards/tag_count_reward": 0.9033482581377029,
"step": 2675
},
{
"clip_ratio": 0.0,
"completion_length": 721.2375335693359,
"epoch": 0.9149099598873431,
"grad_norm": 1.658331274986267,
"kl": 1.6201171875,
"learning_rate": 6.556609803236108e-08,
"loss": 0.243,
"reward": 1.9319197297096253,
"reward_std": 0.5096075862646103,
"rewards/accuracy_reward": 0.19285714998841286,
"rewards/format_reward": 0.8491071850061417,
"rewards/tag_count_reward": 0.8899554044008255,
"step": 2680
},
{
"clip_ratio": 0.0,
"completion_length": 711.7491394042969,
"epoch": 0.9166168814542972,
"grad_norm": 1.9178466796875,
"kl": 1.453515625,
"learning_rate": 6.297775791013933e-08,
"loss": 0.2094,
"reward": 1.983035808801651,
"reward_std": 0.5048068448901176,
"rewards/accuracy_reward": 0.1875000089406967,
"rewards/format_reward": 0.8785714656114578,
"rewards/tag_count_reward": 0.916964328289032,
"step": 2685
},
{
"clip_ratio": 0.0,
"completion_length": 708.7125305175781,
"epoch": 0.9183238030212512,
"grad_norm": 2.0225086212158203,
"kl": 1.497705078125,
"learning_rate": 6.044044609345228e-08,
"loss": 0.2327,
"reward": 1.9497768878936768,
"reward_std": 0.5056387215852738,
"rewards/accuracy_reward": 0.17142857983708382,
"rewards/format_reward": 0.8678571850061416,
"rewards/tag_count_reward": 0.9104911088943481,
"step": 2690
},
{
"clip_ratio": 0.0,
"completion_length": 713.8303894042969,
"epoch": 0.9200307245882051,
"grad_norm": 1.8685790300369263,
"kl": 1.5716796875,
"learning_rate": 5.7954252681617304e-08,
"loss": 0.2246,
"reward": 1.9587054550647736,
"reward_std": 0.48516621366143226,
"rewards/accuracy_reward": 0.21428572656586767,
"rewards/format_reward": 0.8535714656114578,
"rewards/tag_count_reward": 0.8908482551574707,
"step": 2695
},
{
"clip_ratio": 0.0,
"completion_length": 719.8518188476562,
"epoch": 0.9217376461551592,
"grad_norm": 2.501157283782959,
"kl": 1.40458984375,
"learning_rate": 5.5519265958749066e-08,
"loss": 0.1797,
"reward": 1.930803644657135,
"reward_std": 0.4772623166441917,
"rewards/accuracy_reward": 0.1508928645402193,
"rewards/format_reward": 0.87232146859169,
"rewards/tag_count_reward": 0.9075893342494965,
"step": 2700
},
{
"clip_ratio": 0.0,
"completion_length": 711.0937927246093,
"epoch": 0.9234445677221131,
"grad_norm": 1.3878949880599976,
"kl": 1.0825927734375,
"learning_rate": 5.313557239062627e-08,
"loss": 0.1808,
"reward": 1.945089375972748,
"reward_std": 0.43458477333188056,
"rewards/accuracy_reward": 0.14017857844009995,
"rewards/format_reward": 0.8848214745521545,
"rewards/tag_count_reward": 0.9200893342494965,
"step": 2705
},
{
"clip_ratio": 0.0,
"completion_length": 715.4616394042969,
"epoch": 0.9251514892890672,
"grad_norm": 1.9537073373794556,
"kl": 1.6349609375,
"learning_rate": 5.0803256621619445e-08,
"loss": 0.2297,
"reward": 1.9232143819332124,
"reward_std": 0.4841707475483418,
"rewards/accuracy_reward": 0.15625000800937414,
"rewards/format_reward": 0.8616071879863739,
"rewards/tag_count_reward": 0.9053571820259094,
"step": 2710
},
{
"clip_ratio": 0.0,
"completion_length": 702.2705780029297,
"epoch": 0.9268584108560212,
"grad_norm": 1.7269147634506226,
"kl": 1.5267578125,
"learning_rate": 4.852240147168696e-08,
"loss": 0.1783,
"reward": 1.9468750834465027,
"reward_std": 0.5081598028540611,
"rewards/accuracy_reward": 0.1892857238650322,
"rewards/format_reward": 0.8535714685916901,
"rewards/tag_count_reward": 0.9040178984403611,
"step": 2715
},
{
"clip_ratio": 0.0,
"completion_length": 712.5902099609375,
"epoch": 0.9285653324229751,
"grad_norm": 1.7688935995101929,
"kl": 1.719921875,
"learning_rate": 4.629308793343229e-08,
"loss": 0.2492,
"reward": 1.9412947356700898,
"reward_std": 0.5291612073779106,
"rewards/accuracy_reward": 0.19642857909202577,
"rewards/format_reward": 0.8500000387430191,
"rewards/tag_count_reward": 0.8948661148548126,
"step": 2720
},
{
"clip_ratio": 0.0,
"completion_length": 704.3607452392578,
"epoch": 0.9302722539899292,
"grad_norm": 4.1701436042785645,
"kl": 1.4392578125,
"learning_rate": 4.4115395169230074e-08,
"loss": 0.2062,
"reward": 1.94665185213089,
"reward_std": 0.48943726569414137,
"rewards/accuracy_reward": 0.18125000819563866,
"rewards/format_reward": 0.8633928954601288,
"rewards/tag_count_reward": 0.9020089745521546,
"step": 2725
},
{
"clip_ratio": 0.0,
"completion_length": 711.0794952392578,
"epoch": 0.9319791755568831,
"grad_norm": 2.8235621452331543,
"kl": 1.57685546875,
"learning_rate": 4.1989400508413264e-08,
"loss": 0.2257,
"reward": 1.9709822356700897,
"reward_std": 0.5317689374089241,
"rewards/accuracy_reward": 0.20000000819563865,
"rewards/format_reward": 0.8660714745521545,
"rewards/tag_count_reward": 0.9049107611179352,
"step": 2730
},
{
"clip_ratio": 0.0,
"completion_length": 722.0044952392578,
"epoch": 0.9336860971238372,
"grad_norm": 2.165998935699463,
"kl": 1.6806640625,
"learning_rate": 3.991517944452827e-08,
"loss": 0.2257,
"reward": 1.9388393700122832,
"reward_std": 0.5329652637243271,
"rewards/accuracy_reward": 0.17857143664732575,
"rewards/format_reward": 0.8598214626312256,
"rewards/tag_count_reward": 0.9004464685916901,
"step": 2735
},
{
"clip_ratio": 0.0,
"completion_length": 696.9875305175781,
"epoch": 0.9353930186907912,
"grad_norm": 1.3871349096298218,
"kl": 1.5708984375,
"learning_rate": 3.789280563265346e-08,
"loss": 0.2273,
"reward": 1.905803632736206,
"reward_std": 0.5343507960438728,
"rewards/accuracy_reward": 0.16250000800937414,
"rewards/format_reward": 0.8535714656114578,
"rewards/tag_count_reward": 0.8897321790456771,
"step": 2740
},
{
"clip_ratio": 0.0,
"completion_length": 654.9214599609375,
"epoch": 0.9370999402577451,
"grad_norm": 1.9422177076339722,
"kl": 1.5888671875,
"learning_rate": 3.592235088678458e-08,
"loss": 0.1971,
"reward": 2.000000089406967,
"reward_std": 0.4781561218202114,
"rewards/accuracy_reward": 0.21785715334117411,
"rewards/format_reward": 0.873214328289032,
"rewards/tag_count_reward": 0.9089286148548126,
"step": 2745
},
{
"clip_ratio": 0.0,
"completion_length": 710.5107421875,
"epoch": 0.9388068618246992,
"grad_norm": 1.3229178190231323,
"kl": 1.6900390625,
"learning_rate": 3.400388517728348e-08,
"loss": 0.2153,
"reward": 1.9354911625385285,
"reward_std": 0.5064380072057247,
"rewards/accuracy_reward": 0.15625000633299352,
"rewards/format_reward": 0.8687500476837158,
"rewards/tag_count_reward": 0.9104911088943481,
"step": 2750
},
{
"clip_ratio": 0.0,
"completion_length": 694.1669952392579,
"epoch": 0.9405137833916531,
"grad_norm": 2.7505228519439697,
"kl": 1.655078125,
"learning_rate": 3.2137476628395054e-08,
"loss": 0.1961,
"reward": 1.9437500953674316,
"reward_std": 0.4771438464522362,
"rewards/accuracy_reward": 0.1848214389756322,
"rewards/format_reward": 0.8598214745521545,
"rewards/tag_count_reward": 0.8991071879863739,
"step": 2755
},
{
"clip_ratio": 0.0,
"completion_length": 741.4875366210938,
"epoch": 0.9422207049586071,
"grad_norm": 3.672633409500122,
"kl": 1.748828125,
"learning_rate": 3.0323191515826076e-08,
"loss": 0.2204,
"reward": 1.8966518878936767,
"reward_std": 0.5492107257246971,
"rewards/accuracy_reward": 0.16160714998841286,
"rewards/format_reward": 0.8446429044008255,
"rewards/tag_count_reward": 0.8904018312692642,
"step": 2760
},
{
"clip_ratio": 0.0,
"completion_length": 706.8419982910157,
"epoch": 0.9439276265255612,
"grad_norm": 2.4161508083343506,
"kl": 1.6482421875,
"learning_rate": 2.856109426439435e-08,
"loss": 0.2147,
"reward": 1.9370536506175995,
"reward_std": 0.5207428842782974,
"rewards/accuracy_reward": 0.183035721629858,
"rewards/format_reward": 0.8562500387430191,
"rewards/tag_count_reward": 0.897767898440361,
"step": 2765
},
{
"clip_ratio": 0.0,
"completion_length": 731.3250274658203,
"epoch": 0.9456345480925151,
"grad_norm": 2.095442771911621,
"kl": 1.7171875,
"learning_rate": 2.6851247445738247e-08,
"loss": 0.2129,
"reward": 1.8669643700122833,
"reward_std": 0.536584535241127,
"rewards/accuracy_reward": 0.14910715017467738,
"rewards/format_reward": 0.8392857581377029,
"rewards/tag_count_reward": 0.8785714656114578,
"step": 2770
},
{
"clip_ratio": 0.0,
"completion_length": 713.9116394042969,
"epoch": 0.9473414696594692,
"grad_norm": 2.0644876956939697,
"kl": 1.33837890625,
"learning_rate": 2.519371177609714e-08,
"loss": 0.1655,
"reward": 1.9444197297096253,
"reward_std": 0.4576558813452721,
"rewards/accuracy_reward": 0.1696428654715419,
"rewards/format_reward": 0.8705357521772384,
"rewards/tag_count_reward": 0.9042411178350449,
"step": 2775
},
{
"clip_ratio": 0.0,
"completion_length": 738.437533569336,
"epoch": 0.9490483912264231,
"grad_norm": 2.707775354385376,
"kl": 1.655078125,
"learning_rate": 2.358854611415362e-08,
"loss": 0.2686,
"reward": 1.9511161744594574,
"reward_std": 0.5168059259653092,
"rewards/accuracy_reward": 0.20178572358563543,
"rewards/format_reward": 0.8526786148548127,
"rewards/tag_count_reward": 0.896651828289032,
"step": 2780
},
{
"clip_ratio": 0.0,
"completion_length": 666.6893157958984,
"epoch": 0.9507553127933771,
"grad_norm": 2.604048013687134,
"kl": 1.36875,
"learning_rate": 2.2035807458944845e-08,
"loss": 0.1994,
"reward": 2.0129465401172637,
"reward_std": 0.44142256677150726,
"rewards/accuracy_reward": 0.2133928656578064,
"rewards/format_reward": 0.8803571850061417,
"rewards/tag_count_reward": 0.9191964626312256,
"step": 2785
},
{
"clip_ratio": 0.0,
"completion_length": 699.650032043457,
"epoch": 0.9524622343603312,
"grad_norm": 0.9698619246482849,
"kl": 1.36435546875,
"learning_rate": 2.0535550947837824e-08,
"loss": 0.2257,
"reward": 1.9832590162754058,
"reward_std": 0.5147185429930687,
"rewards/accuracy_reward": 0.2035714380443096,
"rewards/format_reward": 0.8687500417232513,
"rewards/tag_count_reward": 0.9109375387430191,
"step": 2790
},
{
"clip_ratio": 0.0,
"completion_length": 724.2723602294922,
"epoch": 0.9541691559272851,
"grad_norm": 2.4312736988067627,
"kl": 1.88720703125,
"learning_rate": 1.9087829854571137e-08,
"loss": 0.2303,
"reward": 1.9149554252624512,
"reward_std": 0.5436064839363098,
"rewards/accuracy_reward": 0.19196429159492254,
"rewards/format_reward": 0.8366071879863739,
"rewards/tag_count_reward": 0.8863839745521546,
"step": 2795
},
{
"clip_ratio": 0.0,
"completion_length": 687.9143157958985,
"epoch": 0.9558760774942391,
"grad_norm": 1.6051888465881348,
"kl": 1.741796875,
"learning_rate": 1.7692695587363804e-08,
"loss": 0.2005,
"reward": 1.9395090281963348,
"reward_std": 0.5374544084072113,
"rewards/accuracy_reward": 0.1964285785332322,
"rewards/format_reward": 0.8526786088943481,
"rewards/tag_count_reward": 0.890401828289032,
"step": 2800
},
{
"clip_ratio": 0.0,
"completion_length": 733.0598510742187,
"epoch": 0.9575829990611932,
"grad_norm": 1.3395869731903076,
"kl": 1.4322265625,
"learning_rate": 1.6350197687089897e-08,
"loss": 0.2022,
"reward": 1.9455358266830445,
"reward_std": 0.4857571929693222,
"rewards/accuracy_reward": 0.1794642936438322,
"rewards/format_reward": 0.8642857521772385,
"rewards/tag_count_reward": 0.9017857551574707,
"step": 2805
},
{
"clip_ratio": 0.0,
"completion_length": 743.7893157958985,
"epoch": 0.9592899206281471,
"grad_norm": 2.4686439037323,
"kl": 1.9875,
"learning_rate": 1.5060383825518943e-08,
"loss": 0.1842,
"reward": 1.9044643819332123,
"reward_std": 0.5226494466885925,
"rewards/accuracy_reward": 0.17142857694998384,
"rewards/format_reward": 0.8437500417232513,
"rewards/tag_count_reward": 0.8892857521772385,
"step": 2810
},
{
"clip_ratio": 0.0,
"completion_length": 695.3214599609375,
"epoch": 0.9609968421951012,
"grad_norm": 1.3053197860717773,
"kl": 1.5158203125,
"learning_rate": 1.3823299803622957e-08,
"loss": 0.2142,
"reward": 1.9276786625385285,
"reward_std": 0.45726575776934625,
"rewards/accuracy_reward": 0.15892857713624836,
"rewards/format_reward": 0.866964328289032,
"rewards/tag_count_reward": 0.9017857551574707,
"step": 2815
},
{
"clip_ratio": 0.0,
"completion_length": 665.1518127441407,
"epoch": 0.9627037637620551,
"grad_norm": 1.9331916570663452,
"kl": 1.36396484375,
"learning_rate": 1.2638989549950742e-08,
"loss": 0.2132,
"reward": 1.9772322356700898,
"reward_std": 0.44482519626617434,
"rewards/accuracy_reward": 0.17500000707805158,
"rewards/format_reward": 0.8839286059141159,
"rewards/tag_count_reward": 0.9183036178350449,
"step": 2820
},
{
"clip_ratio": 0.0,
"completion_length": 741.8071838378906,
"epoch": 0.9644106853290091,
"grad_norm": 2.253441572189331,
"kl": 1.952734375,
"learning_rate": 1.150749511906729e-08,
"loss": 0.2679,
"reward": 1.9303572177886963,
"reward_std": 0.5730097323656083,
"rewards/accuracy_reward": 0.213392869848758,
"rewards/format_reward": 0.8303571820259095,
"rewards/tag_count_reward": 0.8866071850061417,
"step": 2825
},
{
"clip_ratio": 0.0,
"completion_length": 726.2223510742188,
"epoch": 0.9661176068959632,
"grad_norm": 1.3774980306625366,
"kl": 1.4826171875,
"learning_rate": 1.0428856690061161e-08,
"loss": 0.2107,
"reward": 1.9537947297096252,
"reward_std": 0.49637353494763375,
"rewards/accuracy_reward": 0.17500000782310962,
"rewards/format_reward": 0.8705357491970063,
"rewards/tag_count_reward": 0.9082589685916901,
"step": 2830
},
{
"clip_ratio": 0.0,
"completion_length": 691.8553924560547,
"epoch": 0.9678245284629171,
"grad_norm": 2.7155189514160156,
"kl": 1.510546875,
"learning_rate": 9.403112565116612e-09,
"loss": 0.2297,
"reward": 1.971428644657135,
"reward_std": 0.4926897309720516,
"rewards/accuracy_reward": 0.1866071511991322,
"rewards/format_reward": 0.8732143253087997,
"rewards/tag_count_reward": 0.9116071850061417,
"step": 2835
},
{
"clip_ratio": 0.0,
"completion_length": 736.6098480224609,
"epoch": 0.9695314500298712,
"grad_norm": 1.3236608505249023,
"kl": 1.8693359375,
"learning_rate": 8.430299168154853e-09,
"loss": 0.163,
"reward": 1.8939732909202576,
"reward_std": 0.5278340607881546,
"rewards/accuracy_reward": 0.15625000735744835,
"rewards/format_reward": 0.8473214715719223,
"rewards/tag_count_reward": 0.8904018312692642,
"step": 2840
},
{
"clip_ratio": 0.0,
"completion_length": 715.6053894042968,
"epoch": 0.9712383715968251,
"grad_norm": 1.6628596782684326,
"kl": 1.5591796875,
"learning_rate": 7.510451043539923e-09,
"loss": 0.2034,
"reward": 1.9250000953674316,
"reward_std": 0.5011376716196537,
"rewards/accuracy_reward": 0.146428578812629,
"rewards/format_reward": 0.869642898440361,
"rewards/tag_count_reward": 0.9089286118745804,
"step": 2845
},
{
"clip_ratio": 0.0,
"completion_length": 694.3768157958984,
"epoch": 0.9729452931637791,
"grad_norm": 1.6406400203704834,
"kl": 1.3693359375,
"learning_rate": 6.643600854851828e-09,
"loss": 0.1667,
"reward": 1.9819197297096252,
"reward_std": 0.4982582703232765,
"rewards/accuracy_reward": 0.19285715315490962,
"rewards/format_reward": 0.8776786088943481,
"rewards/tag_count_reward": 0.9113839745521546,
"step": 2850
},
{
"clip_ratio": 0.0,
"completion_length": 706.5500335693359,
"epoch": 0.9746522147307332,
"grad_norm": 1.9430170059204102,
"kl": 1.658251953125,
"learning_rate": 5.829779383726808e-09,
"loss": 0.207,
"reward": 1.9593750953674316,
"reward_std": 0.5111976288259029,
"rewards/accuracy_reward": 0.188392866961658,
"rewards/format_reward": 0.86607146859169,
"rewards/tag_count_reward": 0.9049107611179352,
"step": 2855
},
{
"clip_ratio": 0.0,
"completion_length": 663.8089660644531,
"epoch": 0.9763591362976871,
"grad_norm": 2.3743481636047363,
"kl": 1.263671875,
"learning_rate": 5.069015528765042e-09,
"loss": 0.1589,
"reward": 1.9944197475910186,
"reward_std": 0.45694540068507195,
"rewards/accuracy_reward": 0.17410715138539673,
"rewards/format_reward": 0.8955357521772385,
"rewards/tag_count_reward": 0.924776828289032,
"step": 2860
},
{
"clip_ratio": 0.0,
"completion_length": 694.6955627441406,
"epoch": 0.9780660578646411,
"grad_norm": 1.926790714263916,
"kl": 1.60859375,
"learning_rate": 4.361336304503305e-09,
"loss": 0.1852,
"reward": 1.9319197297096253,
"reward_std": 0.5001587726175785,
"rewards/accuracy_reward": 0.1946428671479225,
"rewards/format_reward": 0.850892898440361,
"rewards/tag_count_reward": 0.8863839656114578,
"step": 2865
},
{
"clip_ratio": 0.0,
"completion_length": 706.7044982910156,
"epoch": 0.9797729794315951,
"grad_norm": 0.963211715221405,
"kl": 1.59814453125,
"learning_rate": 3.7067668404563994e-09,
"loss": 0.2251,
"reward": 1.9821429610252381,
"reward_std": 0.509521733224392,
"rewards/accuracy_reward": 0.21160715334117414,
"rewards/format_reward": 0.8651786148548126,
"rewards/tag_count_reward": 0.9053571850061417,
"step": 2870
},
{
"clip_ratio": 0.0,
"completion_length": 714.2839630126953,
"epoch": 0.9814799009985491,
"grad_norm": 1.6829819679260254,
"kl": 1.930078125,
"learning_rate": 3.105330380224536e-09,
"loss": 0.2473,
"reward": 1.8424108028411865,
"reward_std": 0.5233809776604176,
"rewards/accuracy_reward": 0.13750000530853868,
"rewards/format_reward": 0.8294643253087998,
"rewards/tag_count_reward": 0.8754464626312256,
"step": 2875
},
{
"clip_ratio": 0.0,
"completion_length": 713.0312805175781,
"epoch": 0.9831868225655032,
"grad_norm": 1.9121073484420776,
"kl": 1.67421875,
"learning_rate": 2.5570482806681615e-09,
"loss": 0.2478,
"reward": 1.891071504354477,
"reward_std": 0.5421105667948722,
"rewards/accuracy_reward": 0.1598214370198548,
"rewards/format_reward": 0.8437500447034836,
"rewards/tag_count_reward": 0.8875000387430191,
"step": 2880
},
{
"clip_ratio": 0.0,
"completion_length": 735.0348571777344,
"epoch": 0.9848937441324571,
"grad_norm": 2.0713632106781006,
"kl": 1.6849609375,
"learning_rate": 2.061940011149566e-09,
"loss": 0.2128,
"reward": 1.889509028196335,
"reward_std": 0.47978220880031586,
"rewards/accuracy_reward": 0.1500000067986548,
"rewards/format_reward": 0.850892898440361,
"rewards/tag_count_reward": 0.8886161148548126,
"step": 2885
},
{
"clip_ratio": 0.0,
"completion_length": 706.9044921875,
"epoch": 0.9866006656994111,
"grad_norm": 2.39575457572937,
"kl": 1.4171875,
"learning_rate": 1.6200231528412657e-09,
"loss": 0.1681,
"reward": 1.9350447356700897,
"reward_std": 0.49725582599639895,
"rewards/accuracy_reward": 0.17589286342263222,
"rewards/format_reward": 0.859821480512619,
"rewards/tag_count_reward": 0.8993303954601288,
"step": 2890
},
{
"clip_ratio": 0.0,
"completion_length": 748.419677734375,
"epoch": 0.9883075872663651,
"grad_norm": 2.3295395374298096,
"kl": 1.81171875,
"learning_rate": 1.2313133981020074e-09,
"loss": 0.2504,
"reward": 1.9466518700122832,
"reward_std": 0.5394668459892273,
"rewards/accuracy_reward": 0.19464286556467414,
"rewards/format_reward": 0.855357187986374,
"rewards/tag_count_reward": 0.896651828289032,
"step": 2895
},
{
"clip_ratio": 0.0,
"completion_length": 719.3134246826172,
"epoch": 0.9900145088333191,
"grad_norm": 3.916964292526245,
"kl": 1.43359375,
"learning_rate": 8.958245499192108e-10,
"loss": 0.2138,
"reward": 1.9185268938541413,
"reward_std": 0.47515787184238434,
"rewards/accuracy_reward": 0.16071429513394833,
"rewards/format_reward": 0.8625000417232513,
"rewards/tag_count_reward": 0.8953125387430191,
"step": 2900
},
{
"clip_ratio": 0.0,
"completion_length": 722.6116424560547,
"epoch": 0.9917214304002732,
"grad_norm": 1.7201131582260132,
"kl": 1.637890625,
"learning_rate": 6.13568521419361e-10,
"loss": 0.2208,
"reward": 1.8674107909202575,
"reward_std": 0.4943607971072197,
"rewards/accuracy_reward": 0.15535714905709028,
"rewards/format_reward": 0.8321429014205932,
"rewards/tag_count_reward": 0.8799107581377029,
"step": 2905
},
{
"clip_ratio": 0.0,
"completion_length": 698.618782043457,
"epoch": 0.9934283519672271,
"grad_norm": 1.546420931816101,
"kl": 1.32275390625,
"learning_rate": 3.8455533544418106e-10,
"loss": 0.1588,
"reward": 2.012500077486038,
"reward_std": 0.461934956908226,
"rewards/accuracy_reward": 0.19642858058214188,
"rewards/format_reward": 0.8928571909666061,
"rewards/tag_count_reward": 0.9232143253087998,
"step": 2910
},
{
"clip_ratio": 0.0,
"completion_length": 721.3750366210937,
"epoch": 0.9951352735341811,
"grad_norm": 4.14056396484375,
"kl": 1.686181640625,
"learning_rate": 2.0879312419574969e-10,
"loss": 0.2039,
"reward": 1.9272322297096252,
"reward_std": 0.5018501503393054,
"rewards/accuracy_reward": 0.17857143925502897,
"rewards/format_reward": 0.8553571850061417,
"rewards/tag_count_reward": 0.8933036118745804,
"step": 2915
},
{
"clip_ratio": 0.0,
"completion_length": 747.6687835693359,
"epoch": 0.9968421951011351,
"grad_norm": 2.800555944442749,
"kl": 1.7328125,
"learning_rate": 8.628812894656557e-11,
"loss": 0.2431,
"reward": 1.953125101327896,
"reward_std": 0.5268502771854401,
"rewards/accuracy_reward": 0.19375001024454833,
"rewards/format_reward": 0.8625000387430191,
"rewards/tag_count_reward": 0.8968750387430191,
"step": 2920
},
{
"clip_ratio": 0.0,
"completion_length": 714.4089721679687,
"epoch": 0.9985491166680891,
"grad_norm": 2.166287660598755,
"kl": 1.687109375,
"learning_rate": 1.7044699819057652e-11,
"loss": 0.2281,
"reward": 1.9647322416305542,
"reward_std": 0.4754629820585251,
"rewards/accuracy_reward": 0.18571429569274187,
"rewards/format_reward": 0.8696429014205933,
"rewards/tag_count_reward": 0.9093750327825546,
"step": 2925
},
{
"clip_ratio": 0.0,
"completion_length": 734.4546127319336,
"epoch": 0.9999146539216524,
"kl": 1.63232421875,
"reward": 1.9659599140286446,
"reward_std": 0.5131321512162685,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/format_reward": 0.8705357499420643,
"rewards/tag_count_reward": 0.9079241491854191,
"step": 2929,
"total_flos": 0.0,
"train_loss": 0.22819482568542057,
"train_runtime": 410492.8024,
"train_samples_per_second": 0.228,
"train_steps_per_second": 0.007
}
],
"logging_steps": 5,
"max_steps": 2929,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}