Qwen2.5-1.5B-Open-R1-GRPO / trainer_state.json
Blancy's picture
Model save
68e305f verified
raw
history blame
143 kB
Invalid JSON: Unexpected token 'N', ..."ad_norm": NaN, "... is not valid JSON
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9993002099370188,
"eval_steps": 500,
"global_step": 357,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 629.7701110839844,
"epoch": 0.0027991602519244225,
"grad_norm": 0.6973966956138611,
"kl": 0.0,
"learning_rate": 5.555555555555555e-07,
"loss": -0.0,
"reward": 0.2823660857975483,
"reward_std": 0.362668514251709,
"rewards/accuracy_reward": 0.14732143841683865,
"rewards/format_reward": 0.026785715715959668,
"rewards/tag_count_reward": 0.1082589328289032,
"step": 1
},
{
"completion_length": 643.1250305175781,
"epoch": 0.005598320503848845,
"grad_norm": 1.0619760751724243,
"kl": 0.0,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0,
"reward": 0.273995541036129,
"reward_std": 0.37962110340595245,
"rewards/accuracy_reward": 0.09151785937137902,
"rewards/format_reward": 0.0424107164144516,
"rewards/tag_count_reward": 0.14006696827709675,
"step": 2
},
{
"completion_length": 638.2344055175781,
"epoch": 0.008397480755773267,
"grad_norm": 1.4537550210952759,
"kl": 0.000217437744140625,
"learning_rate": 1.6666666666666667e-06,
"loss": -0.0,
"reward": 0.21093751303851604,
"reward_std": 0.34829047322273254,
"rewards/accuracy_reward": 0.09151786309666932,
"rewards/format_reward": 0.022321430267766118,
"rewards/tag_count_reward": 0.09709821827709675,
"step": 3
},
{
"completion_length": 609.8281402587891,
"epoch": 0.01119664100769769,
"grad_norm": 0.9107578992843628,
"kl": 0.00018262863159179688,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0,
"reward": 0.309709832072258,
"reward_std": 0.36093416810035706,
"rewards/accuracy_reward": 0.13839286309666932,
"rewards/format_reward": 0.026785715948790312,
"rewards/tag_count_reward": 0.1445312574505806,
"step": 4
},
{
"completion_length": 608.8281555175781,
"epoch": 0.013995801259622114,
"grad_norm": 0.997071385383606,
"kl": 0.0002675056457519531,
"learning_rate": 2.7777777777777783e-06,
"loss": -0.0,
"reward": 0.2036830447614193,
"reward_std": 0.34512022137641907,
"rewards/accuracy_reward": 0.06250000232830644,
"rewards/format_reward": 0.024553573224693537,
"rewards/tag_count_reward": 0.1166294701397419,
"step": 5
},
{
"completion_length": 607.8839416503906,
"epoch": 0.016794961511546535,
"grad_norm": 0.7406116724014282,
"kl": 0.00133514404296875,
"learning_rate": 3.3333333333333333e-06,
"loss": -0.0,
"reward": 0.2421875074505806,
"reward_std": 0.368218295276165,
"rewards/accuracy_reward": 0.09598214761354029,
"rewards/format_reward": 0.022321429569274187,
"rewards/tag_count_reward": 0.1238839365541935,
"step": 6
},
{
"completion_length": 599.4933319091797,
"epoch": 0.01959412176347096,
"grad_norm": 0.7837677597999573,
"kl": 0.011138916015625,
"learning_rate": 3.88888888888889e-06,
"loss": -0.0,
"reward": 0.3147321604192257,
"reward_std": 0.38796762377023697,
"rewards/accuracy_reward": 0.15178571944124997,
"rewards/format_reward": 0.02232142980210483,
"rewards/tag_count_reward": 0.1406250074505806,
"step": 7
},
{
"completion_length": 646.279052734375,
"epoch": 0.02239328201539538,
"grad_norm": 1.032494306564331,
"kl": 0.0482177734375,
"learning_rate": 4.444444444444444e-06,
"loss": -0.0,
"reward": 0.2382812611758709,
"reward_std": 0.4348388612270355,
"rewards/accuracy_reward": 0.022321430034935474,
"rewards/format_reward": 0.03571428754366934,
"rewards/tag_count_reward": 0.1802455447614193,
"step": 8
},
{
"completion_length": 630.9799194335938,
"epoch": 0.025192442267319804,
"grad_norm": 0.912998378276825,
"kl": 0.50537109375,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.301897332072258,
"reward_std": 0.4177221581339836,
"rewards/accuracy_reward": 0.10267857555299997,
"rewards/format_reward": 0.033482144586741924,
"rewards/tag_count_reward": 0.1657366119325161,
"step": 9
},
{
"completion_length": 638.5826110839844,
"epoch": 0.02799160251924423,
"grad_norm": 1.2475032806396484,
"kl": 272.5,
"learning_rate": 5.555555555555557e-06,
"loss": 0.0,
"reward": 0.3320312649011612,
"reward_std": 0.43727361410856247,
"rewards/accuracy_reward": 0.026785715948790312,
"rewards/format_reward": 0.037946430034935474,
"rewards/tag_count_reward": 0.2672991193830967,
"step": 10
},
{
"completion_length": 525.0491333007812,
"epoch": 0.03079076277116865,
"grad_norm": 0.9516815543174744,
"kl": 4692.0,
"learning_rate": 6.111111111111112e-06,
"loss": -0.0,
"reward": 0.5831473469734192,
"reward_std": 0.4952840134501457,
"rewards/accuracy_reward": 0.129464291036129,
"rewards/format_reward": 0.058035717345774174,
"rewards/tag_count_reward": 0.3956473395228386,
"step": 11
},
{
"completion_length": 554.5111999511719,
"epoch": 0.03358992302309307,
"grad_norm": 0.8821593523025513,
"kl": 107712.0,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0,
"reward": 0.4581473395228386,
"reward_std": 0.44926758110523224,
"rewards/accuracy_reward": 0.05580357275903225,
"rewards/format_reward": 0.04464285960420966,
"rewards/tag_count_reward": 0.357700914144516,
"step": 12
},
{
"completion_length": 456.93082427978516,
"epoch": 0.0363890832750175,
"grad_norm": 1.2099668979644775,
"kl": 34275328.0,
"learning_rate": 7.222222222222223e-06,
"loss": 0.0,
"reward": 0.6450893208384514,
"reward_std": 0.44861604273319244,
"rewards/accuracy_reward": 0.09821429033763707,
"rewards/format_reward": 0.029017858672887087,
"rewards/tag_count_reward": 0.517857164144516,
"step": 13
},
{
"completion_length": 359.7901916503906,
"epoch": 0.03918824352694192,
"grad_norm": 1.3564475774765015,
"kl": 1912602624.0,
"learning_rate": 7.77777777777778e-06,
"loss": -0.0,
"reward": 0.6975446790456772,
"reward_std": 0.42205285280942917,
"rewards/accuracy_reward": 0.03348214388824999,
"rewards/format_reward": 0.0223214291036129,
"rewards/tag_count_reward": 0.6417410969734192,
"step": 14
},
{
"completion_length": 250.95537185668945,
"epoch": 0.04198740377886634,
"grad_norm": 2.748863697052002,
"kl": 121970360320.0,
"learning_rate": 8.333333333333334e-06,
"loss": 0.0,
"reward": 0.9246652126312256,
"reward_std": 0.309246726334095,
"rewards/accuracy_reward": 0.10491071967408061,
"rewards/format_reward": 0.03125000186264515,
"rewards/tag_count_reward": 0.788504496216774,
"step": 15
},
{
"completion_length": 179.02456283569336,
"epoch": 0.04478656403079076,
"grad_norm": 8.485617637634277,
"kl": 1172190527488.0,
"learning_rate": 8.888888888888888e-06,
"loss": 0.0,
"reward": 0.9799107611179352,
"reward_std": 0.2687787376344204,
"rewards/accuracy_reward": 0.1093750037252903,
"rewards/format_reward": 0.03348214365541935,
"rewards/tag_count_reward": 0.8370536118745804,
"step": 16
},
{
"completion_length": 140.1183090209961,
"epoch": 0.04758572428271519,
"grad_norm": 9.24393367767334,
"kl": 9486509015040.0,
"learning_rate": 9.444444444444445e-06,
"loss": 0.0,
"reward": 0.96261166036129,
"reward_std": 0.2797929644584656,
"rewards/accuracy_reward": 0.03125000139698386,
"rewards/format_reward": 0.0580357164144516,
"rewards/tag_count_reward": 0.8733259290456772,
"step": 17
},
{
"completion_length": 117.89286231994629,
"epoch": 0.05038488453463961,
"grad_norm": 23.932544708251953,
"kl": 1.1927502138114048e+16,
"learning_rate": 1e-05,
"loss": -0.0,
"reward": 0.9838170111179352,
"reward_std": 0.2322257235646248,
"rewards/accuracy_reward": 0.04017857322469354,
"rewards/format_reward": 0.05357143096625805,
"rewards/tag_count_reward": 0.8900670111179352,
"step": 18
},
{
"completion_length": 86.49553871154785,
"epoch": 0.05318404478656403,
"grad_norm": 46.19959259033203,
"kl": 1.2350671618100848e+20,
"learning_rate": 1.0555555555555557e-05,
"loss": -0.0,
"reward": 1.0697545111179352,
"reward_std": 0.24332104623317719,
"rewards/accuracy_reward": 0.0468750037252903,
"rewards/format_reward": 0.10267857648432255,
"rewards/tag_count_reward": 0.9202009290456772,
"step": 19
},
{
"completion_length": 76.75670051574707,
"epoch": 0.05598320503848846,
"grad_norm": 56.76476287841797,
"kl": 3.6570670126129186e+22,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.0,
"reward": 1.1439732909202576,
"reward_std": 0.2484389767050743,
"rewards/accuracy_reward": 0.1116071492433548,
"rewards/format_reward": 0.0959821492433548,
"rewards/tag_count_reward": 0.9363839775323868,
"step": 20
},
{
"completion_length": 81.15625286102295,
"epoch": 0.05878236529041288,
"grad_norm": 72.71674346923828,
"kl": 1.3916814025016844e+25,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.0,
"reward": 1.1205357611179352,
"reward_std": 0.3521122932434082,
"rewards/accuracy_reward": 0.0446428582072258,
"rewards/format_reward": 0.1875000074505806,
"rewards/tag_count_reward": 0.88839291036129,
"step": 21
},
{
"completion_length": 82.05134105682373,
"epoch": 0.0615815255423373,
"grad_norm": 290.8921203613281,
"kl": 1.9149384982695726e+27,
"learning_rate": 1.2222222222222224e-05,
"loss": -0.0,
"reward": 1.315290242433548,
"reward_std": 0.42858483642339706,
"rewards/accuracy_reward": 0.08035714668221772,
"rewards/format_reward": 0.4308035895228386,
"rewards/tag_count_reward": 0.804129496216774,
"step": 22
},
{
"completion_length": 91.14732551574707,
"epoch": 0.06438068579426172,
"grad_norm": 397.04351806640625,
"kl": 7.412165985221214e+28,
"learning_rate": 1.2777777777777777e-05,
"loss": -0.0,
"reward": 1.479352742433548,
"reward_std": 0.4619317427277565,
"rewards/accuracy_reward": 0.082589291036129,
"rewards/format_reward": 0.6339286118745804,
"rewards/tag_count_reward": 0.7628348767757416,
"step": 23
},
{
"completion_length": 132.92634391784668,
"epoch": 0.06717984604618614,
"grad_norm": 211.87890625,
"kl": 1.4108164314902722e+27,
"learning_rate": 1.3333333333333333e-05,
"loss": -0.0,
"reward": 1.3069196939468384,
"reward_std": 0.4795772433280945,
"rewards/accuracy_reward": 0.03794643026776612,
"rewards/format_reward": 0.5625000298023224,
"rewards/tag_count_reward": 0.706473246216774,
"step": 24
},
{
"completion_length": 414.10717010498047,
"epoch": 0.06997900629811056,
"grad_norm": 275.24444580078125,
"kl": 6.022578291123375e+29,
"learning_rate": 1.388888888888889e-05,
"loss": 0.0,
"reward": 0.9899553954601288,
"reward_std": 0.5238821133971214,
"rewards/accuracy_reward": 0.0736607164144516,
"rewards/format_reward": 0.4174107387661934,
"rewards/tag_count_reward": 0.4988839477300644,
"step": 25
},
{
"completion_length": 391.50670623779297,
"epoch": 0.072778166550035,
"grad_norm": 199.7307891845703,
"kl": 6.953063542251838e+32,
"learning_rate": 1.4444444444444446e-05,
"loss": -0.0,
"reward": 0.957589328289032,
"reward_std": 0.5497933924198151,
"rewards/accuracy_reward": 0.07812500232830644,
"rewards/format_reward": 0.3816964477300644,
"rewards/tag_count_reward": 0.4977678805589676,
"step": 26
},
{
"completion_length": 485.42859649658203,
"epoch": 0.07557732680195942,
"grad_norm": NaN,
"kl": Infinity,
"learning_rate": 1.5000000000000002e-05,
"loss": -0.0,
"reward": 0.651227705180645,
"reward_std": 0.41753336787223816,
"rewards/accuracy_reward": 0.03794643026776612,
"rewards/format_reward": 0.2299107238650322,
"rewards/tag_count_reward": 0.3833705559372902,
"step": 27
},
{
"completion_length": 1024.0,
"epoch": 0.07837648705388384,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.555555555555556e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 28
},
{
"completion_length": 1024.0,
"epoch": 0.08117564730580826,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.6111111111111115e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 29
},
{
"completion_length": 1024.0,
"epoch": 0.08397480755773268,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 30
},
{
"completion_length": 1024.0,
"epoch": 0.0867739678096571,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7222222222222224e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 31
},
{
"completion_length": 1024.0,
"epoch": 0.08957312806158152,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 32
},
{
"completion_length": 1024.0,
"epoch": 0.09237228831350595,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 33
},
{
"completion_length": 1024.0,
"epoch": 0.09517144856543037,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.888888888888889e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 34
},
{
"completion_length": 1024.0,
"epoch": 0.0979706088173548,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 35
},
{
"completion_length": 1024.0,
"epoch": 0.10076976906927922,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 36
},
{
"completion_length": 1024.0,
"epoch": 0.10356892932120364,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9999521087449523e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 37
},
{
"completion_length": 1024.0,
"epoch": 0.10636808957312806,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9998084395669537e-05,
"loss": 0.0,
"reward": 0.2142857238650322,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.2142857238650322,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 38
},
{
"completion_length": 1024.0,
"epoch": 0.10916724982505248,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9995690062269985e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 39
},
{
"completion_length": 1024.0,
"epoch": 0.11196641007697691,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9992338316586132e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 40
},
{
"completion_length": 1024.0,
"epoch": 0.11476557032890133,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9988029479656596e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 41
},
{
"completion_length": 1024.0,
"epoch": 0.11756473058082575,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9982763964192586e-05,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 42
},
{
"completion_length": 1024.0,
"epoch": 0.12036389083275018,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9976542274538394e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 43
},
{
"completion_length": 1024.0,
"epoch": 0.1231630510846746,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9969365006623072e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 44
},
{
"completion_length": 1024.0,
"epoch": 0.12596221133659902,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.996123284790336e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 45
},
{
"completion_length": 1024.0,
"epoch": 0.12876137158852344,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9952146577297827e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 46
},
{
"completion_length": 1024.0,
"epoch": 0.13156053184044786,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9942107065112286e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 47
},
{
"completion_length": 1024.0,
"epoch": 0.13435969209237228,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9931115272956405e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 48
},
{
"completion_length": 1024.0,
"epoch": 0.1371588523442967,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9919172253651637e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 49
},
{
"completion_length": 1024.0,
"epoch": 0.13995801259622112,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9906279151130338e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 50
},
{
"completion_length": 1024.0,
"epoch": 0.14275717284814557,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.989243720032624e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 51
},
{
"completion_length": 1024.0,
"epoch": 0.14555633310007,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.987764772705613e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 52
},
{
"completion_length": 1024.0,
"epoch": 0.1483554933519944,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9861912147892884e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 53
},
{
"completion_length": 1024.0,
"epoch": 0.15115465360391883,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9845231970029774e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 54
},
{
"completion_length": 1024.0,
"epoch": 0.15395381385584325,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.98276087911361e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 55
},
{
"completion_length": 1024.0,
"epoch": 0.15675297410776767,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9809044299204173e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 56
},
{
"completion_length": 1024.0,
"epoch": 0.1595521343596921,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.978954027238763e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 57
},
{
"completion_length": 1024.0,
"epoch": 0.16235129461161651,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9769098578831113e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 58
},
{
"completion_length": 1024.0,
"epoch": 0.16515045486354094,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.974772117649135e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 59
},
{
"completion_length": 1024.0,
"epoch": 0.16794961511546536,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.972541011294959e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 60
},
{
"completion_length": 1024.0,
"epoch": 0.17074877536738978,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9702167525215504e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 61
},
{
"completion_length": 1024.0,
"epoch": 0.1735479356193142,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9677995639522482e-05,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 62
},
{
"completion_length": 1024.0,
"epoch": 0.17634709587123862,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9652896771114416e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 63
},
{
"completion_length": 1024.0,
"epoch": 0.17914625612316304,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9626873324023915e-05,
"loss": 0.0,
"reward": 0.2500000149011612,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.2500000149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 64
},
{
"completion_length": 1024.0,
"epoch": 0.1819454163750875,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.959992779084207e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 65
},
{
"completion_length": 1024.0,
"epoch": 0.1847445766270119,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9572062752479684e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 66
},
{
"completion_length": 1024.0,
"epoch": 0.18754373687893633,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9543280877920073e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 67
},
{
"completion_length": 1024.0,
"epoch": 0.19034289713086075,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9513584923963426e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 68
},
{
"completion_length": 1024.0,
"epoch": 0.19314205738278517,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9482977734962753e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 69
},
{
"completion_length": 1024.0,
"epoch": 0.1959412176347096,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.945146224255145e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 70
},
{
"completion_length": 1024.0,
"epoch": 0.198740377886634,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9419041465362477e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 71
},
{
"completion_length": 1024.0,
"epoch": 0.20153953813855843,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9385718508739263e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 72
},
{
"completion_length": 1024.0,
"epoch": 0.20433869839048285,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9351496564438228e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 73
},
{
"completion_length": 1024.0,
"epoch": 0.20713785864240727,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.93163789103231e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 74
},
{
"completion_length": 1024.0,
"epoch": 0.2099370188943317,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9280368910050943e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 75
},
{
"completion_length": 1024.0,
"epoch": 0.21273617914625612,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9243470012749968e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 76
},
{
"completion_length": 1024.0,
"epoch": 0.21553533939818054,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9205685752689178e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 77
},
{
"completion_length": 1024.0,
"epoch": 0.21833449965010496,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9167019748939847e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 78
},
{
"completion_length": 1024.0,
"epoch": 0.22113365990202938,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9127475705028864e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 79
},
{
"completion_length": 1024.0,
"epoch": 0.22393282015395383,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.908705740858402e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 80
},
{
"completion_length": 1024.0,
"epoch": 0.22673198040587825,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.9045768730971198e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 81
},
{
"completion_length": 1024.0,
"epoch": 0.22953114065780267,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.900361362692358e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 82
},
{
"completion_length": 1024.0,
"epoch": 0.2323303009097271,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8960596134162845e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 83
},
{
"completion_length": 1024.0,
"epoch": 0.2351294611616515,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8916720373012425e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 84
},
{
"completion_length": 1024.0,
"epoch": 0.23792862141357593,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.887199054600286e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 85
},
{
"completion_length": 1024.0,
"epoch": 0.24072778166550035,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8826410937469256e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 86
},
{
"completion_length": 1024.0,
"epoch": 0.24352694191742477,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8779985913140927e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 87
},
{
"completion_length": 1024.0,
"epoch": 0.2463261021693492,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.873271991972323e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 88
},
{
"completion_length": 1024.0,
"epoch": 0.2491252624212736,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8684617484471662e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 89
},
{
"completion_length": 1024.0,
"epoch": 0.25192442267319803,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8635683214758213e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 90
},
{
"completion_length": 1024.0,
"epoch": 0.2547235829251225,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8585921797630064e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 91
},
{
"completion_length": 1024.0,
"epoch": 0.2575227431770469,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8535337999360655e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 92
},
{
"completion_length": 1024.0,
"epoch": 0.2603219034289713,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8483936664993152e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 93
},
{
"completion_length": 1024.0,
"epoch": 0.2631210636808957,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8431722717876383e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 94
},
{
"completion_length": 1024.0,
"epoch": 0.26592022393282017,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.837870115919327e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 95
},
{
"completion_length": 1024.0,
"epoch": 0.26871938418474456,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8324877067481782e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 96
},
{
"completion_length": 1024.0,
"epoch": 0.271518544436669,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8270255598148542e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 97
},
{
"completion_length": 1024.0,
"epoch": 0.2743177046885934,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8214841982974975e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 98
},
{
"completion_length": 1024.0,
"epoch": 0.27711686494051785,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.815864152961624e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 99
},
{
"completion_length": 1024.0,
"epoch": 0.27991602519244224,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8101659621092832e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 100
},
{
"completion_length": 1024.0,
"epoch": 0.2827151854443667,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.804390171527497e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 101
},
{
"completion_length": 1024.0,
"epoch": 0.28551434569629114,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.798537334435986e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 102
},
{
"completion_length": 1024.0,
"epoch": 0.28831350594821553,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.792608011434178e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 103
},
{
"completion_length": 1024.0,
"epoch": 0.29111266620014,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.786602770447513e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 104
},
{
"completion_length": 1024.0,
"epoch": 0.2939118264520644,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.780522186673046e-05,
"loss": 0.0,
"reward": 0.1428571529686451,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571529686451,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 105
},
{
"completion_length": 1024.0,
"epoch": 0.2967109867039888,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7743668425243547e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 106
},
{
"completion_length": 1024.0,
"epoch": 0.2995101469559132,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.768137327575751e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 107
},
{
"completion_length": 1024.0,
"epoch": 0.30230930720783766,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7618342385058147e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 108
},
{
"completion_length": 1024.0,
"epoch": 0.30510846745976206,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7554581790402372e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 109
},
{
"completion_length": 1024.0,
"epoch": 0.3079076277116865,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.749009759893999e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 110
},
{
"completion_length": 1024.0,
"epoch": 0.3107067879636109,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7424895987128723e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 111
},
{
"completion_length": 1024.0,
"epoch": 0.31350594821553535,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7358983200142608e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 112
},
{
"completion_length": 1024.0,
"epoch": 0.31630510846745974,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7292365551273835e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 113
},
{
"completion_length": 1024.0,
"epoch": 0.3191042687193842,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7225049421328024e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 114
},
{
"completion_length": 1024.0,
"epoch": 0.3219034289713086,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7157041258013074e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 115
},
{
"completion_length": 1024.0,
"epoch": 0.32470258922323303,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7088347575321575e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 116
},
{
"completion_length": 1024.0,
"epoch": 0.3275017494751575,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7018974952906885e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 117
},
{
"completion_length": 1024.0,
"epoch": 0.33030090972708187,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.6948930035452905e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 118
},
{
"completion_length": 1024.0,
"epoch": 0.3331000699790063,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.687821953203765e-05,
"loss": 0.0,
"reward": 0.1785714365541935,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1785714365541935,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 119
},
{
"completion_length": 1024.0,
"epoch": 0.3358992302309307,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.680685021549063e-05,
"loss": 0.0,
"reward": 0.1785714365541935,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1785714365541935,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 120
},
{
"completion_length": 1024.0,
"epoch": 0.33869839048285516,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.6734828921744127e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 121
},
{
"completion_length": 1024.0,
"epoch": 0.34149755073477955,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.6662162549178433e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 122
},
{
"completion_length": 1024.0,
"epoch": 0.344296710986704,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.658885805796111e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 123
},
{
"completion_length": 1024.0,
"epoch": 0.3470958712386284,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.651492246938034e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 124
},
{
"completion_length": 1024.0,
"epoch": 0.34989503149055284,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.6440362865172373e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 125
},
{
"completion_length": 1024.0,
"epoch": 0.35269419174247724,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.636518638684325e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 126
},
{
"completion_length": 1024.0,
"epoch": 0.3554933519944017,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.628940023498477e-05,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 127
},
{
"completion_length": 1024.0,
"epoch": 0.3582925122463261,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.621301166858479e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 128
},
{
"completion_length": 1024.0,
"epoch": 0.3610916724982505,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.613602800433194e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 129
},
{
"completion_length": 1024.0,
"epoch": 0.363890832750175,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.6058456615914815e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 130
},
{
"completion_length": 1024.0,
"epoch": 0.36668999300209937,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.598030493331572e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 131
},
{
"completion_length": 1024.0,
"epoch": 0.3694891532540238,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.590158044209897e-05,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 132
},
{
"completion_length": 1024.0,
"epoch": 0.3722883135059482,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5822290682693944e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 133
},
{
"completion_length": 1024.0,
"epoch": 0.37508747375787266,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.574244324967283e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 134
},
{
"completion_length": 1024.0,
"epoch": 0.37788663400979705,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.566204579102317e-05,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 135
},
{
"completion_length": 1024.0,
"epoch": 0.3806857942617215,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5581106007415382e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 136
},
{
"completion_length": 1024.0,
"epoch": 0.3834849545136459,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5499631651465086e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 137
},
{
"completion_length": 1024.0,
"epoch": 0.38628411476557034,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5417630526990613e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 138
},
{
"completion_length": 1024.0,
"epoch": 0.38908327501749473,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5335110488265497e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 139
},
{
"completion_length": 1024.0,
"epoch": 0.3918824352694192,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5252079439266179e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 140
},
{
"completion_length": 1024.0,
"epoch": 0.3946815955213436,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5168545332914942e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 141
},
{
"completion_length": 1024.0,
"epoch": 0.397480755773268,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5084516170318181e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 142
},
{
"completion_length": 1024.0,
"epoch": 0.4002799160251924,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 143
},
{
"completion_length": 1024.0,
"epoch": 0.40307907627711687,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4915004917131345e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 144
},
{
"completion_length": 1024.0,
"epoch": 0.4058782365290413,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4829539062754597e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 145
},
{
"completion_length": 1024.0,
"epoch": 0.4086773967809657,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.474361062300381e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 146
},
{
"completion_length": 1024.0,
"epoch": 0.41147655703289016,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4657227828320637e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 147
},
{
"completion_length": 1024.0,
"epoch": 0.41427571728481455,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4570398952665982e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 148
},
{
"completion_length": 1024.0,
"epoch": 0.417074877536739,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4483132312727501e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 149
},
{
"completion_length": 1024.0,
"epoch": 0.4198740377886634,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4395436267123017e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 150
},
{
"completion_length": 1024.0,
"epoch": 0.42267319804058784,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4307319215599904e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 151
},
{
"completion_length": 1024.0,
"epoch": 0.42547235829251223,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4218789598230536e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 152
},
{
"completion_length": 1024.0,
"epoch": 0.4282715185444367,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4129855894603885e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 153
},
{
"completion_length": 1024.0,
"epoch": 0.4310706787963611,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4040526623013317e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 154
},
{
"completion_length": 1024.0,
"epoch": 0.4338698390482855,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.3950810339640689e-05,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 155
},
{
"completion_length": 1024.0,
"epoch": 0.4366689993002099,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.3860715637736817e-05,
"loss": 0.0,
"reward": 0.1071428656578064,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428656578064,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 156
},
{
"completion_length": 1024.0,
"epoch": 0.43946815955213436,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.3770251146798401e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 157
},
{
"completion_length": 1024.0,
"epoch": 0.44226731980405876,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.367942553174145e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 158
},
{
"completion_length": 1024.0,
"epoch": 0.4450664800559832,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.358824749207136e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 159
},
{
"completion_length": 1024.0,
"epoch": 0.44786564030790765,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.3496725761049637e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 160
},
{
"completion_length": 1024.0,
"epoch": 0.45066480055983205,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.3404869104857405e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 161
},
{
"completion_length": 1024.0,
"epoch": 0.4534639608117565,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.331268632175576e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 162
},
{
"completion_length": 1024.0,
"epoch": 0.4562631210636809,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.3220186241243063e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 163
},
{
"completion_length": 1024.0,
"epoch": 0.45906228131560534,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.31273777232092e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 164
},
{
"completion_length": 1024.0,
"epoch": 0.46186144156752973,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.3034269657086993e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 165
},
{
"completion_length": 1024.0,
"epoch": 0.4646606018194542,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2940870961000725e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 166
},
{
"completion_length": 1024.0,
"epoch": 0.46745976207137857,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2847190580911942e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 167
},
{
"completion_length": 1024.0,
"epoch": 0.470258922323303,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.27532374897626e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 168
},
{
"completion_length": 1024.0,
"epoch": 0.4730580825752274,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2659020686615602e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 169
},
{
"completion_length": 1024.0,
"epoch": 0.47585724282715186,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2564549195792842e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 170
},
{
"completion_length": 1024.0,
"epoch": 0.47865640307907625,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2469832066010843e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 171
},
{
"completion_length": 1024.0,
"epoch": 0.4814555633310007,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.237487836951405e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 172
},
{
"completion_length": 1024.0,
"epoch": 0.4842547235829251,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2279697201205852e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 173
},
{
"completion_length": 1024.0,
"epoch": 0.48705388383484954,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2184297677777463e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 174
},
{
"completion_length": 1024.0,
"epoch": 0.489853044086774,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2088688936834705e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 175
},
{
"completion_length": 1024.0,
"epoch": 0.4926522043386984,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1992880136022766e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 176
},
{
"completion_length": 1024.0,
"epoch": 0.49545136459062283,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1896880452149077e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 177
},
{
"completion_length": 1024.0,
"epoch": 0.4982505248425472,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1800699080304333e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 178
},
{
"completion_length": 1024.0,
"epoch": 0.5010496850944717,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.170434523298175e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 179
},
{
"completion_length": 1024.0,
"epoch": 0.5038488453463961,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1607828139194683e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 180
},
{
"completion_length": 1024.0,
"epoch": 0.5066480055983205,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1511157043592642e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 181
},
{
"completion_length": 1024.0,
"epoch": 0.509447165850245,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1414341205575817e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 182
},
{
"completion_length": 1024.0,
"epoch": 0.5122463261021694,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1317389898408188e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 183
},
{
"completion_length": 1024.0,
"epoch": 0.5150454863540938,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.122031240832932e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 184
},
{
"completion_length": 1024.0,
"epoch": 0.5178446466060181,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1123118033664877e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 185
},
{
"completion_length": 1024.0,
"epoch": 0.5206438068579426,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1025816083936036e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 186
},
{
"completion_length": 1024.0,
"epoch": 0.523442967109867,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.0928415878967781e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 187
},
{
"completion_length": 1024.0,
"epoch": 0.5262421273617914,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.0830926747996225e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 188
},
{
"completion_length": 1024.0,
"epoch": 0.5290412876137159,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.073335802877504e-05,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 189
},
{
"completion_length": 1024.0,
"epoch": 0.5318404478656403,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.0635719066681064e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 190
},
{
"completion_length": 1024.0,
"epoch": 0.5346396081175647,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.053801921381916e-05,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 191
},
{
"completion_length": 1024.0,
"epoch": 0.5374387683694891,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.0440267828126478e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 192
},
{
"completion_length": 1024.0,
"epoch": 0.5402379286214136,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.0342474272476108e-05,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 193
},
{
"completion_length": 1024.0,
"epoch": 0.543037088873338,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.0244647913780272e-05,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 194
},
{
"completion_length": 1024.0,
"epoch": 0.5458362491252624,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.0146798122093167e-05,
"loss": 0.0,
"reward": 0.1785714402794838,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1785714402794838,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 195
},
{
"completion_length": 1024.0,
"epoch": 0.5486354093771868,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.004893426971345e-05,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 196
},
{
"completion_length": 1024.0,
"epoch": 0.5514345696291113,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.951065730286553e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 197
},
{
"completion_length": 1024.0,
"epoch": 0.5542337298810357,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.853201877906836e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 198
},
{
"completion_length": 1024.0,
"epoch": 0.5570328901329601,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.755352086219733e-06,
"loss": 0.0,
"reward": 0.1428571529686451,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571529686451,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 199
},
{
"completion_length": 1024.0,
"epoch": 0.5598320503848845,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.657525727523897e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 200
},
{
"completion_length": 1024.0,
"epoch": 0.562631210636809,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.559732171873524e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 201
},
{
"completion_length": 1024.0,
"epoch": 0.5654303708887334,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.461980786180844e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 202
},
{
"completion_length": 1024.0,
"epoch": 0.5682295311406578,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.364280933318943e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 203
},
{
"completion_length": 1024.0,
"epoch": 0.5710286913925823,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.266641971224963e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 204
},
{
"completion_length": 1024.0,
"epoch": 0.5738278516445067,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.16907325200378e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 205
},
{
"completion_length": 1024.0,
"epoch": 0.5766270118964311,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.071584121032224e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 206
},
{
"completion_length": 1024.0,
"epoch": 0.5794261721483555,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.974183916063967e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 207
},
{
"completion_length": 1024.0,
"epoch": 0.58222533240028,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.876881966335128e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 208
},
{
"completion_length": 1024.0,
"epoch": 0.5850244926522044,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.779687591670687e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 209
},
{
"completion_length": 1024.0,
"epoch": 0.5878236529041287,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.682610101591813e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 210
},
{
"completion_length": 1024.0,
"epoch": 0.5906228131560531,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.585658794424188e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 211
},
{
"completion_length": 1024.0,
"epoch": 0.5934219734079776,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.488842956407361e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 212
},
{
"completion_length": 1024.0,
"epoch": 0.596221133659902,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.39217186080532e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 213
},
{
"completion_length": 1024.0,
"epoch": 0.5990202939118264,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.295654767018254e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 214
},
{
"completion_length": 1024.0,
"epoch": 0.6018194541637508,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.19930091969567e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 215
},
{
"completion_length": 1024.0,
"epoch": 0.6046186144156753,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.103119547850924e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 216
},
{
"completion_length": 1024.0,
"epoch": 0.6074177746675997,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.00711986397724e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 217
},
{
"completion_length": 1024.0,
"epoch": 0.6102169349195241,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.911311063165298e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 218
},
{
"completion_length": 1024.0,
"epoch": 0.6130160951714486,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.815702322222539e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 219
},
{
"completion_length": 1024.0,
"epoch": 0.615815255423373,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.720302798794153e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 220
},
{
"completion_length": 1024.0,
"epoch": 0.6186144156752974,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.6251216304859555e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 221
},
{
"completion_length": 1024.0,
"epoch": 0.6214135759272218,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.530167933989161e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 222
},
{
"completion_length": 1024.0,
"epoch": 0.6242127361791463,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.435450804207165e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 223
},
{
"completion_length": 1024.0,
"epoch": 0.6270118964310707,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.340979313384404e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 224
},
{
"completion_length": 1024.0,
"epoch": 0.6298110566829951,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.246762510237404e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 225
},
{
"completion_length": 1024.0,
"epoch": 0.6326102169349195,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.1528094190880625e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 226
},
{
"completion_length": 1024.0,
"epoch": 0.635409377186844,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.059129038999282e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 227
},
{
"completion_length": 1024.0,
"epoch": 0.6382085374387684,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.965730342913011e-06,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 228
},
{
"completion_length": 1024.0,
"epoch": 0.6410076976906928,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.872622276790804e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 229
},
{
"completion_length": 1024.0,
"epoch": 0.6438068579426172,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.779813758756943e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 230
},
{
"completion_length": 1024.0,
"epoch": 0.6466060181945417,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.687313678244243e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 231
},
{
"completion_length": 1024.0,
"epoch": 0.6494051784464661,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.595130895142601e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 232
},
{
"completion_length": 1024.0,
"epoch": 0.6522043386983905,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.5032742389503676e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 233
},
{
"completion_length": 1024.0,
"epoch": 0.655003498950315,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.411752507928643e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 234
},
{
"completion_length": 1024.0,
"epoch": 0.6578026592022393,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.3205744682585545e-06,
"loss": 0.0,
"reward": 0.1785714365541935,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1785714365541935,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 235
},
{
"completion_length": 1024.0,
"epoch": 0.6606018194541637,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.229748853201605e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 236
},
{
"completion_length": 1024.0,
"epoch": 0.6634009797060881,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.139284362263185e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 237
},
{
"completion_length": 1024.0,
"epoch": 0.6662001399580126,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.049189660359316e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 238
},
{
"completion_length": 1024.0,
"epoch": 0.668999300209937,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.959473376986686e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 239
},
{
"completion_length": 1024.0,
"epoch": 0.6717984604618614,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.8701441053961185e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 240
},
{
"completion_length": 1024.0,
"epoch": 0.6745976207137858,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.781210401769466e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 241
},
{
"completion_length": 1024.0,
"epoch": 0.6773967809657103,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.692680784400102e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 242
},
{
"completion_length": 1024.0,
"epoch": 0.6801959412176347,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.604563732876989e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 243
},
{
"completion_length": 1024.0,
"epoch": 0.6829951014695591,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.516867687272504e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 244
},
{
"completion_length": 1024.0,
"epoch": 0.6857942617214835,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.429601047334022e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 245
},
{
"completion_length": 1024.0,
"epoch": 0.688593421973408,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.342772171679364e-06,
"loss": 0.0,
"reward": 0.1785714402794838,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1785714402794838,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 246
},
{
"completion_length": 1024.0,
"epoch": 0.6913925822253324,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.256389376996192e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 247
},
{
"completion_length": 1024.0,
"epoch": 0.6941917424772568,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.17046093724541e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 248
},
{
"completion_length": 1024.0,
"epoch": 0.6969909027291813,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.084995082868658e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 249
},
{
"completion_length": 1024.0,
"epoch": 0.6997900629811057,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.000000000000003e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 250
},
{
"completion_length": 1024.0,
"epoch": 0.7025892232330301,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.9154838296818246e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 251
},
{
"completion_length": 1024.0,
"epoch": 0.7053883834849545,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.831454667085059e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 252
},
{
"completion_length": 1024.0,
"epoch": 0.708187543736879,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.747920560733825e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 253
},
{
"completion_length": 1024.0,
"epoch": 0.7109867039888034,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.664889511734509e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 254
},
{
"completion_length": 1024.0,
"epoch": 0.7137858642407278,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.58236947300939e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 255
},
{
"completion_length": 1024.0,
"epoch": 0.7165850244926522,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.500368348534918e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 256
},
{
"completion_length": 1024.0,
"epoch": 0.7193841847445767,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.418893992584624e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 257
},
{
"completion_length": 1024.0,
"epoch": 0.722183344996501,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.33795420897683e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 258
},
{
"completion_length": 1024.0,
"epoch": 0.7249825052484254,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.257556750327176e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 259
},
{
"completion_length": 1024.0,
"epoch": 0.72778166550035,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.17770931730606e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 260
},
{
"completion_length": 1024.0,
"epoch": 0.7305808257522743,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.098419557901036e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 261
},
{
"completion_length": 1024.0,
"epoch": 0.7333799860041987,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.019695066684285e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 262
},
{
"completion_length": 1024.0,
"epoch": 0.7361791462561231,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.9415433840851845e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 263
},
{
"completion_length": 1024.0,
"epoch": 0.7389783065080476,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.8639719956680624e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 264
},
{
"completion_length": 1024.0,
"epoch": 0.741777466759972,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.7869883314152114e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 265
},
{
"completion_length": 1024.0,
"epoch": 0.7445766270118964,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.7105997650152326e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 266
},
{
"completion_length": 1024.0,
"epoch": 0.7473757872638208,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.6348136131567537e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 267
},
{
"completion_length": 1024.0,
"epoch": 0.7501749475157453,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.5596371348276325e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 268
},
{
"completion_length": 1024.0,
"epoch": 0.7529741077676697,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.485077530619664e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 269
},
{
"completion_length": 1024.0,
"epoch": 0.7557732680195941,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.4111419420388904e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 270
},
{
"completion_length": 1024.0,
"epoch": 0.7585724282715185,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.3378374508215704e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 271
},
{
"completion_length": 1024.0,
"epoch": 0.761371588523443,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.2651710782558798e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 272
},
{
"completion_length": 1024.0,
"epoch": 0.7641707487753674,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.1931497845093753e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 273
},
{
"completion_length": 1024.0,
"epoch": 0.7669699090272918,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.121780467962353e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 274
},
{
"completion_length": 1024.0,
"epoch": 0.7697690692792163,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.0510699645470988e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 275
},
{
"completion_length": 1024.0,
"epoch": 0.7725682295311407,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.981025047093118e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 276
},
{
"completion_length": 1024.0,
"epoch": 0.7753673897830651,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.911652424678425e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 277
},
{
"completion_length": 1024.0,
"epoch": 0.7781665500349895,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.8429587419869288e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 278
},
{
"completion_length": 1024.0,
"epoch": 0.780965710286914,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.77495057867198e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 279
},
{
"completion_length": 1024.0,
"epoch": 0.7837648705388384,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.7076344487261695e-06,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 280
},
{
"completion_length": 1024.0,
"epoch": 0.7865640307907628,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.6410167998573945e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 281
},
{
"completion_length": 1024.0,
"epoch": 0.7893631910426872,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.57510401287128e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 282
},
{
"completion_length": 1024.0,
"epoch": 0.7921623512946117,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.5099024010600136e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 283
},
{
"completion_length": 1024.0,
"epoch": 0.794961511546536,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.445418209597632e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 284
},
{
"completion_length": 1024.0,
"epoch": 0.7977606717984604,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.381657614941858e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 285
},
{
"completion_length": 1024.0,
"epoch": 0.8005598320503848,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.318626724242491e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 286
},
{
"completion_length": 1024.0,
"epoch": 0.8033589923023093,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.2563315747564575e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 287
},
{
"completion_length": 1024.0,
"epoch": 0.8061581525542337,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.1947781332695406e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 288
},
{
"completion_length": 1024.0,
"epoch": 0.8089573128061581,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.133972295524875e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 289
},
{
"completion_length": 1024.0,
"epoch": 0.8117564730580826,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.073919885658223e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 290
},
{
"completion_length": 1024.0,
"epoch": 0.814555633310007,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.0146266556401405e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 291
},
{
"completion_length": 1024.0,
"epoch": 0.8173547935619314,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.956098284725031e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 292
},
{
"completion_length": 1024.0,
"epoch": 0.8201539538138558,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.898340378907172e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 293
},
{
"completion_length": 1024.0,
"epoch": 0.8229531140657803,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.8413584703837618e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 294
},
{
"completion_length": 1024.0,
"epoch": 0.8257522743177047,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7851580170250304e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 295
},
{
"completion_length": 1024.0,
"epoch": 0.8285514345696291,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.729744401851463e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 296
},
{
"completion_length": 1024.0,
"epoch": 0.8313505948215535,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.6751229325182194e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 297
},
{
"completion_length": 1024.0,
"epoch": 0.834149755073478,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.6212988408067354e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 298
},
{
"completion_length": 1024.0,
"epoch": 0.8369489153254024,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5682772821236192e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 299
},
{
"completion_length": 1024.0,
"epoch": 0.8397480755773268,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.516063335006851e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 300
},
{
"completion_length": 1024.0,
"epoch": 0.8425472358292512,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4646620006393497e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 301
},
{
"completion_length": 1024.0,
"epoch": 0.8453463960811757,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.4140782023699396e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 302
},
{
"completion_length": 1024.0,
"epoch": 0.8481455563331001,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.3643167852417894e-06,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 303
},
{
"completion_length": 1024.0,
"epoch": 0.8509447165850245,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.3153825155283395e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 304
},
{
"completion_length": 1024.0,
"epoch": 0.853743876836949,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2672800802767715e-06,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 305
},
{
"completion_length": 1024.0,
"epoch": 0.8565430370888734,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2200140868590759e-06,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 306
},
{
"completion_length": 1024.0,
"epoch": 0.8593421973407978,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1735890625307466e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 307
},
{
"completion_length": 1024.0,
"epoch": 0.8621413575927221,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.128009453997142e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 308
},
{
"completion_length": 1024.0,
"epoch": 0.8649405178446467,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.0832796269875757e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 309
},
{
"completion_length": 1024.0,
"epoch": 0.867739678096571,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.0394038658371575e-06,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 310
},
{
"completion_length": 1024.0,
"epoch": 0.8705388383484954,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.963863730764222e-07,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 311
},
{
"completion_length": 1024.0,
"epoch": 0.8733379986004198,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.542312690288035e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 312
},
{
"completion_length": 1024.0,
"epoch": 0.8761371588523443,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.129425914159839e-07,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 313
},
{
"completion_length": 1024.0,
"epoch": 0.8789363191042687,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.725242949711376e-07,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 314
},
{
"completion_length": 1024.0,
"epoch": 0.8817354793561931,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.329802510601559e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 315
},
{
"completion_length": 1024.0,
"epoch": 0.8845346396081175,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.943142473108234e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 316
},
{
"completion_length": 1024.0,
"epoch": 0.887333799860042,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.565299872500331e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 317
},
{
"completion_length": 1024.0,
"epoch": 0.8901329601119664,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.196310899490577e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 318
},
{
"completion_length": 1024.0,
"epoch": 0.8929321203638908,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.836210896769014e-07,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 319
},
{
"completion_length": 1024.0,
"epoch": 0.8957312806158153,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.485034355617748e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 320
},
{
"completion_length": 1024.0,
"epoch": 0.8985304408677397,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.142814912607409e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 321
},
{
"completion_length": 1024.0,
"epoch": 0.9013296011196641,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.809585346375235e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 322
},
{
"completion_length": 1024.0,
"epoch": 0.9041287613715885,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.485377574485528e-07,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 323
},
{
"completion_length": 1024.0,
"epoch": 0.906927921623513,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.17022265037247e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 324
},
{
"completion_length": 1024.0,
"epoch": 0.9097270818754374,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.864150760365771e-07,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 325
},
{
"completion_length": 1024.0,
"epoch": 0.9125262421273618,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.567191220799305e-07,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 326
},
{
"completion_length": 1024.0,
"epoch": 0.9153254023792862,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.2793724752031807e-07,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 327
},
{
"completion_length": 1024.0,
"epoch": 0.9181245626312107,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.000722091579301e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 328
},
{
"completion_length": 1024.0,
"epoch": 0.9209237228831351,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.731266759760854e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 329
},
{
"completion_length": 1024.0,
"epoch": 0.9237228831350595,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.471032288855869e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 330
},
{
"completion_length": 1024.0,
"epoch": 0.9265220433869839,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.2200436047752026e-07,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 331
},
{
"completion_length": 1024.0,
"epoch": 0.9293212036389084,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.978324747844996e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 332
},
{
"completion_length": 1024.0,
"epoch": 0.9321203638908327,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.745898870504116e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 333
},
{
"completion_length": 1024.0,
"epoch": 0.9349195241427571,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.5227882350865154e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 334
},
{
"completion_length": 1024.0,
"epoch": 0.9377186843946816,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.309014211688865e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 335
},
{
"completion_length": 1024.0,
"epoch": 0.940517844646606,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.104597276123721e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 336
},
{
"completion_length": 1024.0,
"epoch": 0.9433170048985304,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.909557007958307e-07,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 337
},
{
"completion_length": 1024.0,
"epoch": 0.9461161651504548,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7239120886390347e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 338
},
{
"completion_length": 1024.0,
"epoch": 0.9489153254023793,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.5476802997022812e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 339
},
{
"completion_length": 1024.0,
"epoch": 0.9517144856543037,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.3808785210711606e-07,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 340
},
{
"completion_length": 1024.0,
"epoch": 0.9545136459062281,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.2235227294387085e-07,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 341
},
{
"completion_length": 1024.0,
"epoch": 0.9573128061581525,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.075627996737627e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 342
},
{
"completion_length": 1024.0,
"epoch": 0.960111966410077,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 9.372084886966392e-08,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 343
},
{
"completion_length": 1024.0,
"epoch": 0.9629111266620014,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 8.082774634836754e-08,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 344
},
{
"completion_length": 1024.0,
"epoch": 0.9657102869139258,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 6.888472704359661e-08,
"loss": 0.0,
"reward": 0.0357142873108387,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 345
},
{
"completion_length": 1024.0,
"epoch": 0.9685094471658502,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 5.7892934887717746e-08,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 346
},
{
"completion_length": 1024.0,
"epoch": 0.9713086074177747,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.785342270217319e-08,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 347
},
{
"completion_length": 1024.0,
"epoch": 0.9741077676696991,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.8767152096641504e-08,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 348
},
{
"completion_length": 1024.0,
"epoch": 0.9769069279216235,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 3.063499337692788e-08,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 349
},
{
"completion_length": 1024.0,
"epoch": 0.979706088173548,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 2.3457725461607518e-08,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 350
},
{
"completion_length": 1024.0,
"epoch": 0.9825052484254724,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.7236035807416397e-08,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 351
},
{
"completion_length": 1024.0,
"epoch": 0.9853044086773968,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.1970520343408398e-08,
"loss": 0.0,
"reward": 0.0714285746216774,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 352
},
{
"completion_length": 1024.0,
"epoch": 0.9881035689293212,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 7.661683413868748e-09,
"loss": 0.0,
"reward": 0.1428571492433548,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 353
},
{
"completion_length": 1024.0,
"epoch": 0.9909027291812457,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.309937730015978e-09,
"loss": 0.0,
"reward": 0.1785714365541935,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1785714365541935,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 354
},
{
"completion_length": 1024.0,
"epoch": 0.9937018894331701,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 1.915604330464671e-09,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 355
},
{
"completion_length": 1024.0,
"epoch": 0.9965010496850945,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 4.789125504778281e-10,
"loss": 0.0,
"reward": 0.1071428619325161,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 356
},
{
"completion_length": 1024.0,
"epoch": 0.9993002099370188,
"grad_norm": NaN,
"kl": NaN,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 357
},
{
"epoch": 0.9993002099370188,
"step": 357,
"total_flos": 0.0,
"train_loss": -2.92552325735344e-11,
"train_runtime": 25378.3621,
"train_samples_per_second": 0.394,
"train_steps_per_second": 0.014
}
],
"logging_steps": 1,
"max_steps": 357,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}