Qwen2.5-3B-Knowledge-R1-GRPO / trainer_state.json
hzy's picture
Model save
4b48b2b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 2143,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 211.3571517944336,
"epoch": 0.004666355576294913,
"grad_norm": 2.9974076747894287,
"kl": 0.00047588348388671875,
"learning_rate": 4.6511627906976744e-08,
"loss": 0.044,
"reward": 0.6535714566707611,
"reward_std": 0.5595175564289093,
"rewards/accuracy_reward": 0.010714286193251609,
"rewards/format_reward": 0.6428571760654449,
"step": 10
},
{
"completion_length": 211.85358123779298,
"epoch": 0.009332711152589827,
"grad_norm": 2.421142816543579,
"kl": 0.0005392074584960937,
"learning_rate": 9.302325581395349e-08,
"loss": 0.0331,
"reward": 0.6000000357627868,
"reward_std": 0.6225969046354294,
"rewards/accuracy_reward": 0.021428572386503218,
"rewards/format_reward": 0.5785714596509933,
"step": 20
},
{
"completion_length": 225.71072235107422,
"epoch": 0.013999066728884742,
"grad_norm": 2.8735339641571045,
"kl": 0.0006612777709960937,
"learning_rate": 1.3953488372093021e-07,
"loss": 0.0571,
"reward": 0.5857143223285675,
"reward_std": 0.5592944413423538,
"rewards/accuracy_reward": 0.00714285746216774,
"rewards/format_reward": 0.5785714656114578,
"step": 30
},
{
"completion_length": 205.19286651611327,
"epoch": 0.018665422305179653,
"grad_norm": 2.0310211181640625,
"kl": 0.0027374267578125,
"learning_rate": 1.8604651162790698e-07,
"loss": 0.0065,
"reward": 0.6285714566707611,
"reward_std": 0.5308447808027268,
"rewards/accuracy_reward": 0.01428571492433548,
"rewards/format_reward": 0.6142857350409031,
"step": 40
},
{
"completion_length": 194.54643707275392,
"epoch": 0.02333177788147457,
"grad_norm": 3.083293914794922,
"kl": 0.00838165283203125,
"learning_rate": 2.3255813953488372e-07,
"loss": 0.0626,
"reward": 0.7464286029338837,
"reward_std": 0.46758472323417666,
"rewards/accuracy_reward": 0.01785714365541935,
"rewards/format_reward": 0.7285714626312256,
"step": 50
},
{
"completion_length": 197.08929595947265,
"epoch": 0.027998133457769483,
"grad_norm": 2.4293816089630127,
"kl": 0.0138580322265625,
"learning_rate": 2.7906976744186043e-07,
"loss": 0.0292,
"reward": 0.685714328289032,
"reward_std": 0.47790482342243196,
"rewards/accuracy_reward": 0.00714285746216774,
"rewards/format_reward": 0.67857146859169,
"step": 60
},
{
"completion_length": 193.36786499023438,
"epoch": 0.032664489034064395,
"grad_norm": 1.755541443824768,
"kl": 0.00519561767578125,
"learning_rate": 3.2558139534883724e-07,
"loss": 0.028,
"reward": 0.8035714745521545,
"reward_std": 0.36622021347284317,
"rewards/accuracy_reward": 0.01785714365541935,
"rewards/format_reward": 0.7857143342494964,
"step": 70
},
{
"completion_length": 206.29644012451172,
"epoch": 0.03733084461035931,
"grad_norm": 1.8061885833740234,
"kl": 0.0106536865234375,
"learning_rate": 3.7209302325581396e-07,
"loss": 0.023,
"reward": 0.6857143223285675,
"reward_std": 0.4770329385995865,
"rewards/accuracy_reward": 0.00714285746216774,
"rewards/format_reward": 0.6785714626312256,
"step": 80
},
{
"completion_length": 205.0571517944336,
"epoch": 0.041997200186654225,
"grad_norm": 1.5979427099227905,
"kl": 0.008978271484375,
"learning_rate": 4.186046511627907e-07,
"loss": 0.0297,
"reward": 0.8071428894996643,
"reward_std": 0.3088400363922119,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.8071428894996643,
"step": 90
},
{
"completion_length": 217.22858123779298,
"epoch": 0.04666355576294914,
"grad_norm": 1.7703214883804321,
"kl": 0.0110107421875,
"learning_rate": 4.6511627906976743e-07,
"loss": 0.037,
"reward": 0.7285714745521545,
"reward_std": 0.4834458529949188,
"rewards/accuracy_reward": 0.00714285746216774,
"rewards/format_reward": 0.7214286148548126,
"step": 100
},
{
"completion_length": 198.60000915527343,
"epoch": 0.05132991133924405,
"grad_norm": 1.311502456665039,
"kl": 0.0128143310546875,
"learning_rate": 5.116279069767442e-07,
"loss": 0.0184,
"reward": 0.7964286148548126,
"reward_std": 0.3687057480216026,
"rewards/accuracy_reward": 0.010714286193251609,
"rewards/format_reward": 0.7857143342494964,
"step": 110
},
{
"completion_length": 210.16072540283204,
"epoch": 0.05599626691553897,
"grad_norm": 4.063913345336914,
"kl": 0.010107421875,
"learning_rate": 5.581395348837209e-07,
"loss": 0.0155,
"reward": 0.7964286148548126,
"reward_std": 0.34110155403614045,
"rewards/accuracy_reward": 0.010714286193251609,
"rewards/format_reward": 0.785714328289032,
"step": 120
},
{
"completion_length": 203.16429443359374,
"epoch": 0.06066262249183388,
"grad_norm": 1.8343654870986938,
"kl": 0.00831298828125,
"learning_rate": 6.046511627906976e-07,
"loss": 0.0343,
"reward": 0.8607143282890319,
"reward_std": 0.25221002399921416,
"rewards/accuracy_reward": 0.00357142873108387,
"rewards/format_reward": 0.8571429014205932,
"step": 130
},
{
"completion_length": 202.78929595947267,
"epoch": 0.06532897806812879,
"grad_norm": 0.8969087600708008,
"kl": 0.0120025634765625,
"learning_rate": 6.511627906976745e-07,
"loss": 0.0145,
"reward": 0.8321428954601288,
"reward_std": 0.2753357619047165,
"rewards/accuracy_reward": 0.00357142873108387,
"rewards/format_reward": 0.8285714685916901,
"step": 140
},
{
"completion_length": 200.78929595947267,
"epoch": 0.0699953336444237,
"grad_norm": 2.4758172035217285,
"kl": 0.016839599609375,
"learning_rate": 6.976744186046511e-07,
"loss": 0.0205,
"reward": 0.90357146859169,
"reward_std": 0.22839727699756623,
"rewards/accuracy_reward": 0.01785714365541935,
"rewards/format_reward": 0.8857143342494964,
"step": 150
},
{
"completion_length": 208.1607238769531,
"epoch": 0.07466168922071861,
"grad_norm": 1.8052928447723389,
"kl": 0.012713623046875,
"learning_rate": 7.441860465116279e-07,
"loss": 0.0122,
"reward": 0.9321428835391998,
"reward_std": 0.1500000089406967,
"rewards/accuracy_reward": 0.00357142873108387,
"rewards/format_reward": 0.9285714566707611,
"step": 160
},
{
"completion_length": 210.90000762939454,
"epoch": 0.07932804479701354,
"grad_norm": 1.3043458461761475,
"kl": 0.0185272216796875,
"learning_rate": 7.906976744186046e-07,
"loss": 0.0076,
"reward": 0.9178571820259094,
"reward_std": 0.1664957284927368,
"rewards/accuracy_reward": 0.00357142873108387,
"rewards/format_reward": 0.9142857551574707,
"step": 170
},
{
"completion_length": 204.02858123779296,
"epoch": 0.08399440037330845,
"grad_norm": 1.9711344242095947,
"kl": 0.024969482421875,
"learning_rate": 8.372093023255814e-07,
"loss": -0.0035,
"reward": 0.903571480512619,
"reward_std": 0.1994871750473976,
"rewards/accuracy_reward": 0.010714286193251609,
"rewards/format_reward": 0.8928571879863739,
"step": 180
},
{
"completion_length": 201.1821517944336,
"epoch": 0.08866075594960336,
"grad_norm": 8.437910079956055,
"kl": 0.023162841796875,
"learning_rate": 8.837209302325581e-07,
"loss": 0.0113,
"reward": 0.9178571820259094,
"reward_std": 0.18299144729971886,
"rewards/accuracy_reward": 0.010714286193251609,
"rewards/format_reward": 0.9071428894996643,
"step": 190
},
{
"completion_length": 205.58929443359375,
"epoch": 0.09332711152589827,
"grad_norm": 0.09394335001707077,
"kl": 0.01868896484375,
"learning_rate": 9.302325581395349e-07,
"loss": 0.0195,
"reward": 0.9214285910129547,
"reward_std": 0.13299144804477692,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.9214285910129547,
"step": 200
},
{
"completion_length": 217.59286651611328,
"epoch": 0.09799346710219319,
"grad_norm": 2.8316187858581543,
"kl": 0.021905517578125,
"learning_rate": 9.767441860465115e-07,
"loss": 0.0257,
"reward": 0.860714340209961,
"reward_std": 0.2807814501225948,
"rewards/accuracy_reward": 0.010714286193251609,
"rewards/format_reward": 0.8500000536441803,
"step": 210
},
{
"completion_length": 205.23929290771486,
"epoch": 0.1026598226784881,
"grad_norm": 1.3520991802215576,
"kl": 0.015557861328125,
"learning_rate": 9.99983405533249e-07,
"loss": 0.0129,
"reward": 0.9571429014205932,
"reward_std": 0.11428571939468384,
"rewards/accuracy_reward": 0.00714285746216774,
"rewards/format_reward": 0.9500000238418579,
"step": 220
},
{
"completion_length": 201.4571533203125,
"epoch": 0.10732617825478301,
"grad_norm": 6.0174970626831055,
"kl": 0.034246826171875,
"learning_rate": 9.99850656408199e-07,
"loss": 0.016,
"reward": 0.9785714983940125,
"reward_std": 0.1857142947614193,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.9428571701049805,
"step": 230
},
{
"completion_length": 197.5357223510742,
"epoch": 0.11199253383107793,
"grad_norm": 1.7986341714859009,
"kl": 0.0531494140625,
"learning_rate": 9.995851934039294e-07,
"loss": 0.0582,
"reward": 1.096428632736206,
"reward_std": 0.23630646169185637,
"rewards/accuracy_reward": 0.12500000596046448,
"rewards/format_reward": 0.9714285850524902,
"step": 240
},
{
"completion_length": 208.5428680419922,
"epoch": 0.11665888940737285,
"grad_norm": 2.8913028240203857,
"kl": 0.0509765625,
"learning_rate": 9.991870870027424e-07,
"loss": 0.0426,
"reward": 1.0678572058677673,
"reward_std": 0.19603439420461655,
"rewards/accuracy_reward": 0.10357143431901931,
"rewards/format_reward": 0.9642857313156128,
"step": 250
},
{
"completion_length": 187.68929443359374,
"epoch": 0.12132524498366776,
"grad_norm": 0.7588302493095398,
"kl": 0.05869140625,
"learning_rate": 9.98656442904699e-07,
"loss": 0.028,
"reward": 1.110714328289032,
"reward_std": 0.17129081785678862,
"rewards/accuracy_reward": 0.13928571566939354,
"rewards/format_reward": 0.9714285850524902,
"step": 260
},
{
"completion_length": 196.61429443359376,
"epoch": 0.12599160055996267,
"grad_norm": 36.95159912109375,
"kl": 0.0792724609375,
"learning_rate": 9.979934019995547e-07,
"loss": 0.0407,
"reward": 1.1428571820259095,
"reward_std": 0.299724480509758,
"rewards/accuracy_reward": 0.20714286752045155,
"rewards/format_reward": 0.935714316368103,
"step": 270
},
{
"completion_length": 207.1464385986328,
"epoch": 0.13065795613625758,
"grad_norm": 2.451353073120117,
"kl": 0.057666015625,
"learning_rate": 9.97198140329352e-07,
"loss": 0.032,
"reward": 1.1285714864730836,
"reward_std": 0.24082783833146096,
"rewards/accuracy_reward": 0.17142857611179352,
"rewards/format_reward": 0.9571428775787354,
"step": 280
},
{
"completion_length": 207.25000610351563,
"epoch": 0.1353243117125525,
"grad_norm": 1.8005753755569458,
"kl": 0.056298828125,
"learning_rate": 9.962708690416806e-07,
"loss": 0.0108,
"reward": 1.221428632736206,
"reward_std": 0.20909458994865418,
"rewards/accuracy_reward": 0.25000000894069674,
"rewards/format_reward": 0.9714285850524902,
"step": 290
},
{
"completion_length": 201.07143859863282,
"epoch": 0.1399906672888474,
"grad_norm": 1.7425264120101929,
"kl": 0.052978515625,
"learning_rate": 9.952118343336157e-07,
"loss": 0.0208,
"reward": 1.1678571820259094,
"reward_std": 0.24439741671085358,
"rewards/accuracy_reward": 0.22500001043081283,
"rewards/format_reward": 0.9428571581840515,
"step": 300
},
{
"completion_length": 211.52858123779296,
"epoch": 0.14465702286514232,
"grad_norm": 2.8271803855895996,
"kl": 0.0644287109375,
"learning_rate": 9.940213173863515e-07,
"loss": 0.0253,
"reward": 1.1714285969734193,
"reward_std": 0.2888915419578552,
"rewards/accuracy_reward": 0.22857143878936767,
"rewards/format_reward": 0.9428571701049805,
"step": 310
},
{
"completion_length": 212.41786499023436,
"epoch": 0.14932337844143723,
"grad_norm": 1.6736899614334106,
"kl": 0.063427734375,
"learning_rate": 9.926996342905446e-07,
"loss": 0.0327,
"reward": 1.2321429014205934,
"reward_std": 0.22101710960268975,
"rewards/accuracy_reward": 0.26785715818405154,
"rewards/format_reward": 0.9642857313156128,
"step": 320
},
{
"completion_length": 222.3071533203125,
"epoch": 0.15398973401773214,
"grad_norm": 1.0232563018798828,
"kl": 0.0625244140625,
"learning_rate": 9.912471359623905e-07,
"loss": 0.0303,
"reward": 1.2321429014205934,
"reward_std": 0.27462306767702105,
"rewards/accuracy_reward": 0.30357144474983216,
"rewards/format_reward": 0.9285714626312256,
"step": 330
},
{
"completion_length": 199.1357223510742,
"epoch": 0.15865608959402708,
"grad_norm": 0.4898677170276642,
"kl": 0.057177734375,
"learning_rate": 9.89664208050453e-07,
"loss": 0.0513,
"reward": 1.2321429133415223,
"reward_std": 0.25317725613713266,
"rewards/accuracy_reward": 0.27500001192092893,
"rewards/format_reward": 0.9571428775787354,
"step": 340
},
{
"completion_length": 184.546435546875,
"epoch": 0.163322445170322,
"grad_norm": 2.1455774307250977,
"kl": 0.069482421875,
"learning_rate": 9.879512708332718e-07,
"loss": 0.0067,
"reward": 1.2250000476837157,
"reward_std": 0.18287093117833136,
"rewards/accuracy_reward": 0.2607142999768257,
"rewards/format_reward": 0.9642857313156128,
"step": 350
},
{
"completion_length": 182.8071517944336,
"epoch": 0.1679888007466169,
"grad_norm": 1.5658093690872192,
"kl": 0.0677490234375,
"learning_rate": 9.861087791077743e-07,
"loss": 0.0095,
"reward": 1.2750000476837158,
"reward_std": 0.16363511905074118,
"rewards/accuracy_reward": 0.29642858952283857,
"rewards/format_reward": 0.9785714387893677,
"step": 360
},
{
"completion_length": 190.7214385986328,
"epoch": 0.1726551563229118,
"grad_norm": 1.7997597455978394,
"kl": 0.05615234375,
"learning_rate": 9.841372220685253e-07,
"loss": 0.0157,
"reward": 1.235714316368103,
"reward_std": 0.2049168437719345,
"rewards/accuracy_reward": 0.2785714462399483,
"rewards/format_reward": 0.9571428775787354,
"step": 370
},
{
"completion_length": 193.371435546875,
"epoch": 0.17732151189920672,
"grad_norm": 1.4841110706329346,
"kl": 0.060205078125,
"learning_rate": 9.820371231778422e-07,
"loss": 0.0343,
"reward": 1.3142857551574707,
"reward_std": 0.23386457264423371,
"rewards/accuracy_reward": 0.3642857328057289,
"rewards/format_reward": 0.9500000178813934,
"step": 380
},
{
"completion_length": 201.2071563720703,
"epoch": 0.18198786747550164,
"grad_norm": 1.3704997301101685,
"kl": 0.0554443359375,
"learning_rate": 9.79809040026811e-07,
"loss": 0.0187,
"reward": 1.2250000596046449,
"reward_std": 0.2502213083207607,
"rewards/accuracy_reward": 0.28214287348091605,
"rewards/format_reward": 0.9428571701049805,
"step": 390
},
{
"completion_length": 186.56429443359374,
"epoch": 0.18665422305179655,
"grad_norm": 2.2330009937286377,
"kl": 0.0556884765625,
"learning_rate": 9.774535641872433e-07,
"loss": 0.0379,
"reward": 1.2428571820259093,
"reward_std": 0.3126678854227066,
"rewards/accuracy_reward": 0.3214285898953676,
"rewards/format_reward": 0.9214286029338836,
"step": 400
},
{
"completion_length": 171.86786651611328,
"epoch": 0.19132057862809146,
"grad_norm": 1.9319186210632324,
"kl": 0.05830078125,
"learning_rate": 9.749713210546087e-07,
"loss": 0.0061,
"reward": 1.371428632736206,
"reward_std": 0.2255903147161007,
"rewards/accuracy_reward": 0.4000000223517418,
"rewards/format_reward": 0.9714285790920257,
"step": 410
},
{
"completion_length": 194.71429443359375,
"epoch": 0.19598693420438637,
"grad_norm": 1.76435387134552,
"kl": 0.0529052734375,
"learning_rate": 9.723629696819884e-07,
"loss": 0.0143,
"reward": 1.285714340209961,
"reward_std": 0.20479509681463243,
"rewards/accuracy_reward": 0.3357142999768257,
"rewards/format_reward": 0.9500000238418579,
"step": 420
},
{
"completion_length": 188.8714370727539,
"epoch": 0.20065328978068128,
"grad_norm": 1.158916711807251,
"kl": 0.0750244140625,
"learning_rate": 9.696292026050922e-07,
"loss": 0.0165,
"reward": 1.3178571939468384,
"reward_std": 0.26929790526628494,
"rewards/accuracy_reward": 0.382142873480916,
"rewards/format_reward": 0.935714316368103,
"step": 430
},
{
"completion_length": 179.84644012451173,
"epoch": 0.2053196453569762,
"grad_norm": 1.382643699645996,
"kl": 0.11826171875,
"learning_rate": 9.66770745658385e-07,
"loss": 0.0183,
"reward": 1.2571429371833802,
"reward_std": 0.21266788095235825,
"rewards/accuracy_reward": 0.3071428656578064,
"rewards/format_reward": 0.9500000238418579,
"step": 440
},
{
"completion_length": 180.546435546875,
"epoch": 0.2099860009332711,
"grad_norm": 1.8777272701263428,
"kl": 0.0839599609375,
"learning_rate": 9.637883577823721e-07,
"loss": 0.0276,
"reward": 1.3428572058677672,
"reward_std": 0.24049336314201356,
"rewards/accuracy_reward": 0.40000001937150953,
"rewards/format_reward": 0.9428571701049805,
"step": 450
},
{
"completion_length": 190.68572387695312,
"epoch": 0.21465235650956602,
"grad_norm": 1.0944005250930786,
"kl": 0.0735107421875,
"learning_rate": 9.606828308220969e-07,
"loss": 0.0113,
"reward": 1.2785714745521546,
"reward_std": 0.21487789005041122,
"rewards/accuracy_reward": 0.32857144325971605,
"rewards/format_reward": 0.9500000238418579,
"step": 460
},
{
"completion_length": 197.60357971191405,
"epoch": 0.21931871208586096,
"grad_norm": 0.9634405970573425,
"kl": 0.073779296875,
"learning_rate": 9.574549893168977e-07,
"loss": 0.0197,
"reward": 1.2535714626312255,
"reward_std": 0.17683308124542235,
"rewards/accuracy_reward": 0.2892857242375612,
"rewards/format_reward": 0.9642857313156128,
"step": 470
},
{
"completion_length": 203.69643859863282,
"epoch": 0.22398506766215587,
"grad_norm": 1.3643263578414917,
"kl": 0.08203125,
"learning_rate": 9.541056902814896e-07,
"loss": 0.0227,
"reward": 1.285714340209961,
"reward_std": 0.27460705786943435,
"rewards/accuracy_reward": 0.37857144623994826,
"rewards/format_reward": 0.9071428894996643,
"step": 480
},
{
"completion_length": 220.28929595947267,
"epoch": 0.22865142323845078,
"grad_norm": 1.668428897857666,
"kl": 0.0689453125,
"learning_rate": 9.506358229784194e-07,
"loss": 0.0146,
"reward": 1.3071429133415222,
"reward_std": 0.17622366920113564,
"rewards/accuracy_reward": 0.3571428686380386,
"rewards/format_reward": 0.9500000238418579,
"step": 490
},
{
"completion_length": 216.17501220703124,
"epoch": 0.2333177788147457,
"grad_norm": 2.019341468811035,
"kl": 0.079345703125,
"learning_rate": 9.4704630868196e-07,
"loss": 0.0646,
"reward": 1.1821429014205933,
"reward_std": 0.23459327667951585,
"rewards/accuracy_reward": 0.2535714410245419,
"rewards/format_reward": 0.9285714566707611,
"step": 500
},
{
"completion_length": 205.07501068115235,
"epoch": 0.2379841343910406,
"grad_norm": 1.7414186000823975,
"kl": 0.0832763671875,
"learning_rate": 9.433381004335061e-07,
"loss": 0.0468,
"reward": 1.2071429073810578,
"reward_std": 0.2705150328576565,
"rewards/accuracy_reward": 0.30000001192092896,
"rewards/format_reward": 0.9071428894996643,
"step": 510
},
{
"completion_length": 180.91429595947267,
"epoch": 0.24265048996733551,
"grad_norm": 2.2701659202575684,
"kl": 0.124853515625,
"learning_rate": 9.395121827885355e-07,
"loss": 0.0327,
"reward": 1.3142857670783996,
"reward_std": 0.21858522519469262,
"rewards/accuracy_reward": 0.37142859064042566,
"rewards/format_reward": 0.9428571701049805,
"step": 520
},
{
"completion_length": 192.1821517944336,
"epoch": 0.24731684554363043,
"grad_norm": 14.688100814819336,
"kl": 0.170263671875,
"learning_rate": 9.355695715552011e-07,
"loss": 0.0272,
"reward": 1.2142857670783997,
"reward_std": 0.20738017484545707,
"rewards/accuracy_reward": 0.27142858095467093,
"rewards/format_reward": 0.9428571701049805,
"step": 530
},
{
"completion_length": 202.57500915527345,
"epoch": 0.25198320111992534,
"grad_norm": 2.822713613510132,
"kl": 0.1373046875,
"learning_rate": 9.315113135246283e-07,
"loss": 0.0432,
"reward": 1.1928571820259095,
"reward_std": 0.35453804582357407,
"rewards/accuracy_reward": 0.28571429997682574,
"rewards/format_reward": 0.9071429014205933,
"step": 540
},
{
"completion_length": 202.9321517944336,
"epoch": 0.25664955669622025,
"grad_norm": 1.5090000629425049,
"kl": 0.12421875,
"learning_rate": 9.273384861929836e-07,
"loss": 0.0491,
"reward": 1.1178572058677674,
"reward_std": 0.23188644349575044,
"rewards/accuracy_reward": 0.19642858132719992,
"rewards/format_reward": 0.9214286029338836,
"step": 550
},
{
"completion_length": 192.82857666015624,
"epoch": 0.26131591227251516,
"grad_norm": 0.5006041526794434,
"kl": 0.205029296875,
"learning_rate": 9.230521974753919e-07,
"loss": 0.0594,
"reward": 1.2321429252624512,
"reward_std": 0.2444589801132679,
"rewards/accuracy_reward": 0.3321428716182709,
"rewards/format_reward": 0.9000000298023224,
"step": 560
},
{
"completion_length": 171.6428649902344,
"epoch": 0.26598226784881007,
"grad_norm": 1.1855828762054443,
"kl": 0.193359375,
"learning_rate": 9.186535854117776e-07,
"loss": 0.037,
"reward": 1.260714328289032,
"reward_std": 0.2411015473306179,
"rewards/accuracy_reward": 0.33928573578596116,
"rewards/format_reward": 0.9214286029338836,
"step": 570
},
{
"completion_length": 180.17143707275392,
"epoch": 0.270648623425105,
"grad_norm": 1.677296757698059,
"kl": 0.1857421875,
"learning_rate": 9.141438178647065e-07,
"loss": 0.0374,
"reward": 1.3000000596046448,
"reward_std": 0.2666125223040581,
"rewards/accuracy_reward": 0.3714285910129547,
"rewards/format_reward": 0.9285714626312256,
"step": 580
},
{
"completion_length": 186.6821533203125,
"epoch": 0.2753149790013999,
"grad_norm": 5.2433247566223145,
"kl": 0.16669921875,
"learning_rate": 9.095240922093104e-07,
"loss": 0.0407,
"reward": 1.3500000715255738,
"reward_std": 0.18397593572735788,
"rewards/accuracy_reward": 0.39285715818405154,
"rewards/format_reward": 0.9571428656578064,
"step": 590
},
{
"completion_length": 180.6428680419922,
"epoch": 0.2799813345776948,
"grad_norm": 0.4842807650566101,
"kl": 0.190380859375,
"learning_rate": 9.047956350153752e-07,
"loss": 0.0147,
"reward": 1.246428620815277,
"reward_std": 0.21586237102746964,
"rewards/accuracy_reward": 0.28928572684526443,
"rewards/format_reward": 0.9571428775787354,
"step": 600
},
{
"completion_length": 191.20358123779297,
"epoch": 0.2846476901539897,
"grad_norm": 2.350338935852051,
"kl": 0.14912109375,
"learning_rate": 8.999597017216782e-07,
"loss": 0.0334,
"reward": 1.3035714983940125,
"reward_std": 0.16870573312044143,
"rewards/accuracy_reward": 0.33928572833538057,
"rewards/format_reward": 0.9642857313156128,
"step": 610
},
{
"completion_length": 193.98215026855468,
"epoch": 0.28931404573028463,
"grad_norm": 2.8800251483917236,
"kl": 0.21923828125,
"learning_rate": 8.950175763026604e-07,
"loss": 0.0245,
"reward": 1.2071429133415221,
"reward_std": 0.2516971692442894,
"rewards/accuracy_reward": 0.2642857298254967,
"rewards/format_reward": 0.942857164144516,
"step": 620
},
{
"completion_length": 191.61429290771486,
"epoch": 0.29398040130657954,
"grad_norm": 2.0671656131744385,
"kl": 0.1837890625,
"learning_rate": 8.899705709275217e-07,
"loss": 0.0145,
"reward": 1.385714340209961,
"reward_std": 0.20700510069727898,
"rewards/accuracy_reward": 0.4428571671247482,
"rewards/format_reward": 0.9428571701049805,
"step": 630
},
{
"completion_length": 199.62858123779296,
"epoch": 0.29864675688287445,
"grad_norm": 5.944628715515137,
"kl": 0.29609375,
"learning_rate": 8.848200256118312e-07,
"loss": 0.0386,
"reward": 1.246428644657135,
"reward_std": 0.2530567437410355,
"rewards/accuracy_reward": 0.3035714406520128,
"rewards/format_reward": 0.942857164144516,
"step": 640
},
{
"completion_length": 227.48572540283203,
"epoch": 0.30331311245916937,
"grad_norm": 8.969457626342773,
"kl": 0.387255859375,
"learning_rate": 8.795673078617432e-07,
"loss": 0.0707,
"reward": 1.2464286088943481,
"reward_std": 0.2629224382340908,
"rewards/accuracy_reward": 0.3250000149011612,
"rewards/format_reward": 0.9214286029338836,
"step": 650
},
{
"completion_length": 210.75715026855468,
"epoch": 0.3079794680354643,
"grad_norm": 4.416165828704834,
"kl": 0.473828125,
"learning_rate": 8.74213812310915e-07,
"loss": 0.0801,
"reward": 1.221428632736206,
"reward_std": 0.27719090431928634,
"rewards/accuracy_reward": 0.29285715967416764,
"rewards/format_reward": 0.9285714626312256,
"step": 660
},
{
"completion_length": 200.8928649902344,
"epoch": 0.31264582361175924,
"grad_norm": 7.957707405090332,
"kl": 0.52421875,
"learning_rate": 8.68760960350222e-07,
"loss": 0.0485,
"reward": 1.221428644657135,
"reward_std": 0.31562927216291425,
"rewards/accuracy_reward": 0.3071428701281548,
"rewards/format_reward": 0.9142857432365418,
"step": 670
},
{
"completion_length": 194.02857971191406,
"epoch": 0.31731217918805416,
"grad_norm": 2.773921012878418,
"kl": 0.4138671875,
"learning_rate": 8.632101997503674e-07,
"loss": 0.0431,
"reward": 1.246428620815277,
"reward_std": 0.2256075546145439,
"rewards/accuracy_reward": 0.3178571492433548,
"rewards/format_reward": 0.9285714626312256,
"step": 680
},
{
"completion_length": 183.4214324951172,
"epoch": 0.32197853476434907,
"grad_norm": 3.124154567718506,
"kl": 0.189404296875,
"learning_rate": 8.575630042774902e-07,
"loss": 0.0263,
"reward": 1.260714340209961,
"reward_std": 0.1950671538710594,
"rewards/accuracy_reward": 0.3107142999768257,
"rewards/format_reward": 0.950000011920929,
"step": 690
},
{
"completion_length": 214.20000915527345,
"epoch": 0.326644890340644,
"grad_norm": 5.170936107635498,
"kl": 0.279833984375,
"learning_rate": 8.518208733018689e-07,
"loss": 0.0798,
"reward": 1.2071429252624513,
"reward_std": 0.36203873455524443,
"rewards/accuracy_reward": 0.30714287534356116,
"rewards/format_reward": 0.9000000417232513,
"step": 700
},
{
"completion_length": 236.79644165039062,
"epoch": 0.3313112459169389,
"grad_norm": 4.168279647827148,
"kl": 0.5509765625,
"learning_rate": 8.459853313998283e-07,
"loss": 0.1131,
"reward": 1.2428571939468385,
"reward_std": 0.33234085887670517,
"rewards/accuracy_reward": 0.35714287459850313,
"rewards/format_reward": 0.8857143223285675,
"step": 710
},
{
"completion_length": 212.58929443359375,
"epoch": 0.3359776014932338,
"grad_norm": 5.4327569007873535,
"kl": 0.4990234375,
"learning_rate": 8.400579279489541e-07,
"loss": 0.095,
"reward": 1.2250000715255738,
"reward_std": 0.31540548279881475,
"rewards/accuracy_reward": 0.2964285884052515,
"rewards/format_reward": 0.9285714626312256,
"step": 720
},
{
"completion_length": 219.4178665161133,
"epoch": 0.3406439570695287,
"grad_norm": 6.700848579406738,
"kl": 0.37919921875,
"learning_rate": 8.340402367167216e-07,
"loss": 0.0824,
"reward": 1.1571429014205932,
"reward_std": 0.30023063272237777,
"rewards/accuracy_reward": 0.25000000819563867,
"rewards/format_reward": 0.9071428835391998,
"step": 730
},
{
"completion_length": 193.31786346435547,
"epoch": 0.3453103126458236,
"grad_norm": 4.084702014923096,
"kl": 0.28115234375,
"learning_rate": 8.2793385544265e-07,
"loss": 0.0288,
"reward": 1.2535714745521545,
"reward_std": 0.3358024753630161,
"rewards/accuracy_reward": 0.339285734295845,
"rewards/format_reward": 0.9142857432365418,
"step": 740
},
{
"completion_length": 208.07858123779297,
"epoch": 0.34997666822211854,
"grad_norm": 5.686543941497803,
"kl": 0.33671875,
"learning_rate": 8.217404054140909e-07,
"loss": 0.0335,
"reward": 1.1714286088943482,
"reward_std": 0.2581101007759571,
"rewards/accuracy_reward": 0.23571430072188376,
"rewards/format_reward": 0.9357143104076385,
"step": 750
},
{
"completion_length": 205.19286651611327,
"epoch": 0.35464302379841345,
"grad_norm": 2.9921562671661377,
"kl": 0.177294921875,
"learning_rate": 8.154615310357649e-07,
"loss": 0.0755,
"reward": 1.2428571939468385,
"reward_std": 0.27265038043260575,
"rewards/accuracy_reward": 0.30714286789298056,
"rewards/format_reward": 0.935714316368103,
"step": 760
},
{
"completion_length": 211.06072692871095,
"epoch": 0.35930937937470836,
"grad_norm": 2.7796568870544434,
"kl": 0.48134765625,
"learning_rate": 8.090988993931609e-07,
"loss": 0.0967,
"reward": 1.3071429133415222,
"reward_std": 0.27672048956155776,
"rewards/accuracy_reward": 0.39285716861486436,
"rewards/format_reward": 0.9142857491970062,
"step": 770
},
{
"completion_length": 210.21429595947265,
"epoch": 0.36397573495100327,
"grad_norm": 4.467612266540527,
"kl": 0.39296875,
"learning_rate": 8.026541998099126e-07,
"loss": 0.1026,
"reward": 1.1821429014205933,
"reward_std": 0.20331501960754395,
"rewards/accuracy_reward": 0.24642858393490313,
"rewards/format_reward": 0.9357143044471741,
"step": 780
},
{
"completion_length": 184.62858123779296,
"epoch": 0.3686420905272982,
"grad_norm": 2.8182897567749023,
"kl": 0.4158203125,
"learning_rate": 7.961291433992723e-07,
"loss": 0.0864,
"reward": 1.2571429133415222,
"reward_std": 0.3440789520740509,
"rewards/accuracy_reward": 0.35714287906885145,
"rewards/format_reward": 0.9000000417232513,
"step": 790
},
{
"completion_length": 177.10000915527343,
"epoch": 0.3733084461035931,
"grad_norm": 5.841248035430908,
"kl": 0.25205078125,
"learning_rate": 7.895254626097964e-07,
"loss": 0.0477,
"reward": 1.3178572177886962,
"reward_std": 0.26967298090457914,
"rewards/accuracy_reward": 0.3750000178813934,
"rewards/format_reward": 0.9428571701049805,
"step": 800
},
{
"completion_length": 186.17857971191407,
"epoch": 0.377974801679888,
"grad_norm": 2.7772974967956543,
"kl": 0.372216796875,
"learning_rate": 7.828449107653703e-07,
"loss": 0.0548,
"reward": 1.2035714864730835,
"reward_std": 0.21819290220737458,
"rewards/accuracy_reward": 0.26071429774165156,
"rewards/format_reward": 0.9428571701049805,
"step": 810
},
{
"completion_length": 179.5714370727539,
"epoch": 0.3826411572561829,
"grad_norm": 3.573528528213501,
"kl": 0.5103515625,
"learning_rate": 7.760892615996862e-07,
"loss": 0.0807,
"reward": 1.296428620815277,
"reward_std": 0.20319449976086618,
"rewards/accuracy_reward": 0.3392857268452644,
"rewards/format_reward": 0.9571428775787354,
"step": 820
},
{
"completion_length": 201.9464370727539,
"epoch": 0.38730751283247783,
"grad_norm": 2.7865989208221436,
"kl": 0.32451171875,
"learning_rate": 7.692603087853061e-07,
"loss": 0.129,
"reward": 1.1964286327362061,
"reward_std": 0.2745025597512722,
"rewards/accuracy_reward": 0.28214287012815475,
"rewards/format_reward": 0.9142857372760773,
"step": 830
},
{
"completion_length": 187.04286499023436,
"epoch": 0.39197386840877274,
"grad_norm": 5.086669445037842,
"kl": 0.412451171875,
"learning_rate": 7.623598654574282e-07,
"loss": 0.0784,
"reward": 1.2285714864730835,
"reward_std": 0.19613576233386992,
"rewards/accuracy_reward": 0.2785714466124773,
"rewards/format_reward": 0.9500000238418579,
"step": 840
},
{
"completion_length": 205.9321517944336,
"epoch": 0.39664022398506765,
"grad_norm": 6.3026580810546875,
"kl": 0.42822265625,
"learning_rate": 7.553897637324871e-07,
"loss": 0.1118,
"reward": 1.23571435213089,
"reward_std": 0.264027439057827,
"rewards/accuracy_reward": 0.32142858356237414,
"rewards/format_reward": 0.9142857491970062,
"step": 850
},
{
"completion_length": 211.72857971191405,
"epoch": 0.40130657956136256,
"grad_norm": 7.240902423858643,
"kl": 0.609912109375,
"learning_rate": 7.483518542217136e-07,
"loss": 0.1452,
"reward": 1.2392857789993286,
"reward_std": 0.2891633503139019,
"rewards/accuracy_reward": 0.3250000163912773,
"rewards/format_reward": 0.9142857491970062,
"step": 860
},
{
"completion_length": 196.15000915527344,
"epoch": 0.4059729351376575,
"grad_norm": 7.294248104095459,
"kl": 0.38701171875,
"learning_rate": 7.412480055397843e-07,
"loss": 0.0556,
"reward": 1.2500000596046448,
"reward_std": 0.2849683463573456,
"rewards/accuracy_reward": 0.3285714417695999,
"rewards/format_reward": 0.9214285969734192,
"step": 870
},
{
"completion_length": 202.96786651611328,
"epoch": 0.4106392907139524,
"grad_norm": 2.0189414024353027,
"kl": 0.35,
"learning_rate": 7.340801038086918e-07,
"loss": 0.0262,
"reward": 1.2250000476837157,
"reward_std": 0.19948717057704926,
"rewards/accuracy_reward": 0.2750000137835741,
"rewards/format_reward": 0.9500000238418579,
"step": 880
},
{
"completion_length": 190.95001068115235,
"epoch": 0.4153056462902473,
"grad_norm": 11.575712203979492,
"kl": 0.3896484375,
"learning_rate": 7.268500521569655e-07,
"loss": 0.0922,
"reward": 1.2142857670783997,
"reward_std": 0.26723918691277504,
"rewards/accuracy_reward": 0.3071428701281548,
"rewards/format_reward": 0.9071428835391998,
"step": 890
},
{
"completion_length": 181.60000762939453,
"epoch": 0.4199720018665422,
"grad_norm": 8.530049324035645,
"kl": 0.45927734375,
"learning_rate": 7.195597702143772e-07,
"loss": 0.0985,
"reward": 1.1571429133415223,
"reward_std": 0.27336115539073946,
"rewards/accuracy_reward": 0.22857143953442574,
"rewards/format_reward": 0.9285714626312256,
"step": 900
},
{
"completion_length": 190.7464385986328,
"epoch": 0.4246383574428371,
"grad_norm": 5.701374530792236,
"kl": 1.1736328125,
"learning_rate": 7.122111936022668e-07,
"loss": 0.1988,
"reward": 1.2500000596046448,
"reward_std": 0.24506716057658195,
"rewards/accuracy_reward": 0.32857144474983213,
"rewards/format_reward": 0.9214286029338836,
"step": 910
},
{
"completion_length": 172.73214950561524,
"epoch": 0.42930471301913203,
"grad_norm": 4.281832695007324,
"kl": 1.024951171875,
"learning_rate": 7.048062734196204e-07,
"loss": 0.1912,
"reward": 1.2642857909202576,
"reward_std": 0.343449330329895,
"rewards/accuracy_reward": 0.35000001043081286,
"rewards/format_reward": 0.9142857432365418,
"step": 920
},
{
"completion_length": 197.4178665161133,
"epoch": 0.43397106859542695,
"grad_norm": 4.720562934875488,
"kl": 0.543603515625,
"learning_rate": 6.9734697572504e-07,
"loss": 0.0907,
"reward": 1.2571429133415222,
"reward_std": 0.2518312208354473,
"rewards/accuracy_reward": 0.3285714462399483,
"rewards/format_reward": 0.9285714566707611,
"step": 930
},
{
"completion_length": 196.92858123779297,
"epoch": 0.4386374241717219,
"grad_norm": 77.73806762695312,
"kl": 1.05439453125,
"learning_rate": 6.89835281014741e-07,
"loss": 0.1745,
"reward": 1.285714328289032,
"reward_std": 0.28164542019367217,
"rewards/accuracy_reward": 0.3642857253551483,
"rewards/format_reward": 0.9214285969734192,
"step": 940
},
{
"completion_length": 179.39286575317382,
"epoch": 0.4433037797480168,
"grad_norm": 2.6931166648864746,
"kl": 0.42705078125,
"learning_rate": 6.822731836967168e-07,
"loss": 0.0645,
"reward": 1.3428572177886964,
"reward_std": 0.19739395827054979,
"rewards/accuracy_reward": 0.392857152223587,
"rewards/format_reward": 0.9500000238418579,
"step": 950
},
{
"completion_length": 187.36429443359376,
"epoch": 0.44797013532431174,
"grad_norm": 5.5171427726745605,
"kl": 0.443359375,
"learning_rate": 6.746626915612085e-07,
"loss": 0.0781,
"reward": 1.2857143521308898,
"reward_std": 0.19799869433045386,
"rewards/accuracy_reward": 0.357142873108387,
"rewards/format_reward": 0.9285714566707611,
"step": 960
},
{
"completion_length": 184.37500915527343,
"epoch": 0.45263649090060665,
"grad_norm": 13.36310863494873,
"kl": 1.1681640625,
"learning_rate": 6.670058252476235e-07,
"loss": 0.2008,
"reward": 1.3428571820259094,
"reward_std": 0.262357784062624,
"rewards/accuracy_reward": 0.41428573429584503,
"rewards/format_reward": 0.9285714626312256,
"step": 970
},
{
"completion_length": 195.95358123779297,
"epoch": 0.45730284647690156,
"grad_norm": 2.2677857875823975,
"kl": 1.1369140625,
"learning_rate": 6.593046177080408e-07,
"loss": 0.1455,
"reward": 1.1714286088943482,
"reward_std": 0.3041789963841438,
"rewards/accuracy_reward": 0.2714285865426064,
"rewards/format_reward": 0.9000000298023224,
"step": 980
},
{
"completion_length": 193.67857818603517,
"epoch": 0.46196920205319647,
"grad_norm": 1.030552625656128,
"kl": 0.56953125,
"learning_rate": 6.515611136674479e-07,
"loss": 0.0992,
"reward": 1.2642857789993287,
"reward_std": 0.1844715215265751,
"rewards/accuracy_reward": 0.32142859101295473,
"rewards/format_reward": 0.9428571701049805,
"step": 990
},
{
"completion_length": 191.07143630981446,
"epoch": 0.4666355576294914,
"grad_norm": 7.8564453125,
"kl": 0.504833984375,
"learning_rate": 6.437773690808524e-07,
"loss": 0.099,
"reward": 1.3071429133415222,
"reward_std": 0.24715664908289908,
"rewards/accuracy_reward": 0.3714285910129547,
"rewards/format_reward": 0.935714316368103,
"step": 1000
},
{
"completion_length": 201.9714370727539,
"epoch": 0.4713019132057863,
"grad_norm": 2.8117690086364746,
"kl": 1.044677734375,
"learning_rate": 6.359554505874109e-07,
"loss": 0.2054,
"reward": 1.196428608894348,
"reward_std": 0.3307777248322964,
"rewards/accuracy_reward": 0.2821428716182709,
"rewards/format_reward": 0.9142857551574707,
"step": 1010
},
{
"completion_length": 189.82500915527345,
"epoch": 0.4759682687820812,
"grad_norm": 11.323598861694336,
"kl": 0.63037109375,
"learning_rate": 6.280974349617214e-07,
"loss": 0.095,
"reward": 1.2785714745521546,
"reward_std": 0.26157640293240547,
"rewards/accuracy_reward": 0.35000001788139345,
"rewards/format_reward": 0.9285714566707611,
"step": 1020
},
{
"completion_length": 190.90001068115234,
"epoch": 0.4806346243583761,
"grad_norm": 14.018318176269531,
"kl": 0.6455078125,
"learning_rate": 6.202054085624261e-07,
"loss": 0.1192,
"reward": 1.2857143521308898,
"reward_std": 0.23778653591871263,
"rewards/accuracy_reward": 0.35714287161827085,
"rewards/format_reward": 0.9285714566707611,
"step": 1030
},
{
"completion_length": 189.96786499023438,
"epoch": 0.48530097993467103,
"grad_norm": 5.3534932136535645,
"kl": 0.632958984375,
"learning_rate": 6.122814667782673e-07,
"loss": 0.0864,
"reward": 1.2285714626312256,
"reward_std": 0.1533150166273117,
"rewards/accuracy_reward": 0.2500000149011612,
"rewards/format_reward": 0.9785714387893677,
"step": 1040
},
{
"completion_length": 195.0642936706543,
"epoch": 0.48996733551096594,
"grad_norm": 6.974513530731201,
"kl": 0.494384765625,
"learning_rate": 6.043277134717475e-07,
"loss": 0.0765,
"reward": 1.3321428894996643,
"reward_std": 0.1950671575963497,
"rewards/accuracy_reward": 0.37500001639127734,
"rewards/format_reward": 0.9571428775787354,
"step": 1050
},
{
"completion_length": 204.17501068115234,
"epoch": 0.49463369108726085,
"grad_norm": 6.67943000793457,
"kl": 0.497265625,
"learning_rate": 5.963462604205392e-07,
"loss": 0.0889,
"reward": 1.260714340209961,
"reward_std": 0.23447152674198152,
"rewards/accuracy_reward": 0.3250000193715096,
"rewards/format_reward": 0.935714316368103,
"step": 1060
},
{
"completion_length": 217.91072692871094,
"epoch": 0.49930004666355576,
"grad_norm": 2.6225903034210205,
"kl": 1.211474609375,
"learning_rate": 5.883392267567924e-07,
"loss": 0.1539,
"reward": 1.2142857789993287,
"reward_std": 0.2801480941474438,
"rewards/accuracy_reward": 0.28571429699659345,
"rewards/format_reward": 0.9285714566707611,
"step": 1070
},
{
"completion_length": 200.80000762939454,
"epoch": 0.5039664022398507,
"grad_norm": 17.208133697509766,
"kl": 1.3998046875,
"learning_rate": 5.803087384044902e-07,
"loss": 0.2627,
"reward": 1.2071429014205932,
"reward_std": 0.3747034803032875,
"rewards/accuracy_reward": 0.3214285850524902,
"rewards/format_reward": 0.885714328289032,
"step": 1080
},
{
"completion_length": 184.55357818603517,
"epoch": 0.5086327578161456,
"grad_norm": 4.367872714996338,
"kl": 0.80498046875,
"learning_rate": 5.722569275150019e-07,
"loss": 0.1581,
"reward": 1.2642857551574707,
"reward_std": 0.2569018341600895,
"rewards/accuracy_reward": 0.3285714417695999,
"rewards/format_reward": 0.9357143104076385,
"step": 1090
},
{
"completion_length": 179.18572387695312,
"epoch": 0.5132991133924405,
"grad_norm": 4.963561058044434,
"kl": 0.592578125,
"learning_rate": 5.641859319009801e-07,
"loss": 0.0957,
"reward": 1.3250000715255736,
"reward_std": 0.2563889928162098,
"rewards/accuracy_reward": 0.3892857372760773,
"rewards/format_reward": 0.9357143104076385,
"step": 1100
},
{
"completion_length": 180.88929290771483,
"epoch": 0.5179654689687354,
"grad_norm": 1.608384609222412,
"kl": 0.348974609375,
"learning_rate": 5.560978944687576e-07,
"loss": 0.0775,
"reward": 1.2714286208152772,
"reward_std": 0.19492939710617066,
"rewards/accuracy_reward": 0.32142858654260636,
"rewards/format_reward": 0.9500000238418579,
"step": 1110
},
{
"completion_length": 164.9321502685547,
"epoch": 0.5226318245450303,
"grad_norm": 6.095414638519287,
"kl": 0.59853515625,
"learning_rate": 5.479949626493908e-07,
"loss": 0.0792,
"reward": 1.3428571939468383,
"reward_std": 0.18014808967709542,
"rewards/accuracy_reward": 0.37142859399318695,
"rewards/format_reward": 0.9714285850524902,
"step": 1120
},
{
"completion_length": 179.71072387695312,
"epoch": 0.5272981801213252,
"grad_norm": 13.79627513885498,
"kl": 0.299853515625,
"learning_rate": 5.398792878285002e-07,
"loss": 0.0579,
"reward": 1.296428632736206,
"reward_std": 0.16341925486922265,
"rewards/accuracy_reward": 0.32500001788139343,
"rewards/format_reward": 0.9714285850524902,
"step": 1130
},
{
"completion_length": 189.71429595947265,
"epoch": 0.5319645356976201,
"grad_norm": 7.43311071395874,
"kl": 0.44296875,
"learning_rate": 5.317530247750639e-07,
"loss": 0.0818,
"reward": 1.2785714983940124,
"reward_std": 0.1776016980409622,
"rewards/accuracy_reward": 0.32142858393490314,
"rewards/format_reward": 0.9571428716182708,
"step": 1140
},
{
"completion_length": 207.7714416503906,
"epoch": 0.5366308912739151,
"grad_norm": 27.736406326293945,
"kl": 1.2056640625,
"learning_rate": 5.2361833106931e-07,
"loss": 0.2633,
"reward": 1.228571480512619,
"reward_std": 0.36894305497407914,
"rewards/accuracy_reward": 0.3571428719907999,
"rewards/format_reward": 0.8714286148548126,
"step": 1150
},
{
"completion_length": 216.8928695678711,
"epoch": 0.54129724685021,
"grad_norm": 8.771681785583496,
"kl": 0.88974609375,
"learning_rate": 5.154773665298648e-07,
"loss": 0.1611,
"reward": 1.1535714745521546,
"reward_std": 0.2724130667746067,
"rewards/accuracy_reward": 0.2392857253551483,
"rewards/format_reward": 0.9142857491970062,
"step": 1160
},
{
"completion_length": 190.18572082519532,
"epoch": 0.5459636024265049,
"grad_norm": 6.968381404876709,
"kl": 0.37958984375,
"learning_rate": 5.073322926403045e-07,
"loss": 0.0619,
"reward": 1.260714340209961,
"reward_std": 0.15576233565807343,
"rewards/accuracy_reward": 0.275000012293458,
"rewards/format_reward": 0.9857142925262451,
"step": 1170
},
{
"completion_length": 220.9714370727539,
"epoch": 0.5506299580027998,
"grad_norm": 1.6743552684783936,
"kl": 0.696728515625,
"learning_rate": 4.991852719752678e-07,
"loss": 0.1253,
"reward": 1.2321429014205934,
"reward_std": 0.24198277071118354,
"rewards/accuracy_reward": 0.31071430146694184,
"rewards/format_reward": 0.9214285969734192,
"step": 1180
},
{
"completion_length": 203.98929595947266,
"epoch": 0.5552963135790947,
"grad_norm": 8.832259178161621,
"kl": 1.82734375,
"learning_rate": 4.910384676262752e-07,
"loss": 0.1067,
"reward": 1.26071435213089,
"reward_std": 0.32460705041885374,
"rewards/accuracy_reward": 0.36785716116428374,
"rewards/format_reward": 0.8928571701049804,
"step": 1190
},
{
"completion_length": 188.37858123779296,
"epoch": 0.5599626691553896,
"grad_norm": 4.268427848815918,
"kl": 0.414404296875,
"learning_rate": 4.828940426274142e-07,
"loss": 0.0621,
"reward": 1.3285714864730835,
"reward_std": 0.23999654203653337,
"rewards/accuracy_reward": 0.3714285850524902,
"rewards/format_reward": 0.9571428775787354,
"step": 1200
},
{
"completion_length": 207.16429290771484,
"epoch": 0.5646290247316845,
"grad_norm": 22.58124542236328,
"kl": 0.891845703125,
"learning_rate": 4.747541593810377e-07,
"loss": 0.1984,
"reward": 1.2178572058677672,
"reward_std": 0.3189430497586727,
"rewards/accuracy_reward": 0.310714303329587,
"rewards/format_reward": 0.9071428954601288,
"step": 1210
},
{
"completion_length": 218.35000915527343,
"epoch": 0.5692953803079794,
"grad_norm": 7.771918773651123,
"kl": 0.96904296875,
"learning_rate": 4.666209790836316e-07,
"loss": 0.1555,
"reward": 1.2107143342494964,
"reward_std": 0.3533112980425358,
"rewards/accuracy_reward": 0.3178571552038193,
"rewards/format_reward": 0.8928571820259095,
"step": 1220
},
{
"completion_length": 187.87857971191406,
"epoch": 0.5739617358842743,
"grad_norm": 13.805558204650879,
"kl": 0.933251953125,
"learning_rate": 4.5849666115200143e-07,
"loss": 0.1366,
"reward": 1.2500000715255737,
"reward_std": 0.21033736318349838,
"rewards/accuracy_reward": 0.30714286863803864,
"rewards/format_reward": 0.942857164144516,
"step": 1230
},
{
"completion_length": 197.11786804199218,
"epoch": 0.5786280914605693,
"grad_norm": 3.612844228744507,
"kl": 0.625244140625,
"learning_rate": 4.503833626499317e-07,
"loss": 0.1048,
"reward": 1.1892857551574707,
"reward_std": 0.3342569015920162,
"rewards/accuracy_reward": 0.2821428693830967,
"rewards/format_reward": 0.9071428894996643,
"step": 1240
},
{
"completion_length": 193.096435546875,
"epoch": 0.5832944470368642,
"grad_norm": 4.014871597290039,
"kl": 0.572314453125,
"learning_rate": 4.42283237715471e-07,
"loss": 0.0812,
"reward": 1.160714316368103,
"reward_std": 0.28212499171495437,
"rewards/accuracy_reward": 0.26071429550647734,
"rewards/format_reward": 0.9000000357627869,
"step": 1250
},
{
"completion_length": 195.02500762939454,
"epoch": 0.5879608026131591,
"grad_norm": 4.279513359069824,
"kl": 0.7986328125,
"learning_rate": 4.3419843698899234e-07,
"loss": 0.1005,
"reward": 1.2928572058677674,
"reward_std": 0.25148131176829336,
"rewards/accuracy_reward": 0.3571428656578064,
"rewards/format_reward": 0.9357143044471741,
"step": 1260
},
{
"completion_length": 221.07858123779297,
"epoch": 0.592627158189454,
"grad_norm": 4.270975589752197,
"kl": 0.7974609375,
"learning_rate": 4.2613110704218336e-07,
"loss": 0.1913,
"reward": 1.210714340209961,
"reward_std": 0.27596538737416265,
"rewards/accuracy_reward": 0.30357144251465795,
"rewards/format_reward": 0.9071429014205933,
"step": 1270
},
{
"completion_length": 192.57857971191407,
"epoch": 0.5972935137657489,
"grad_norm": 6.560425281524658,
"kl": 1.0640625,
"learning_rate": 4.1808338980811666e-07,
"loss": 0.1447,
"reward": 1.2214286208152771,
"reward_std": 0.2975998237729073,
"rewards/accuracy_reward": 0.3285714462399483,
"rewards/format_reward": 0.8928571760654449,
"step": 1280
},
{
"completion_length": 210.92501220703124,
"epoch": 0.6019598693420438,
"grad_norm": 2.6335413455963135,
"kl": 1.1158203125,
"learning_rate": 4.100574220125506e-07,
"loss": 0.2254,
"reward": 1.2178571939468383,
"reward_std": 0.38201676979660987,
"rewards/accuracy_reward": 0.3250000149011612,
"rewards/format_reward": 0.892857164144516,
"step": 1290
},
{
"completion_length": 226.9035842895508,
"epoch": 0.6066262249183387,
"grad_norm": 6.515714645385742,
"kl": 1.377734375,
"learning_rate": 4.020553346066144e-07,
"loss": 0.2749,
"reward": 1.2035714745521546,
"reward_std": 0.37217203676700594,
"rewards/accuracy_reward": 0.3107142999768257,
"rewards/format_reward": 0.8928571879863739,
"step": 1300
},
{
"completion_length": 211.1928680419922,
"epoch": 0.6112925804946336,
"grad_norm": 16.117403030395508,
"kl": 1.2681640625,
"learning_rate": 3.9407925220102493e-07,
"loss": 0.2125,
"reward": 1.1928571820259095,
"reward_std": 0.3735316038131714,
"rewards/accuracy_reward": 0.2928571544587612,
"rewards/format_reward": 0.9000000357627869,
"step": 1310
},
{
"completion_length": 203.11429290771486,
"epoch": 0.6159589360709286,
"grad_norm": 7.737660884857178,
"kl": 0.8955078125,
"learning_rate": 3.86131292501988e-07,
"loss": 0.126,
"reward": 1.2571429014205933,
"reward_std": 0.31292245015501974,
"rewards/accuracy_reward": 0.3428571581840515,
"rewards/format_reward": 0.9142857551574707,
"step": 1320
},
{
"completion_length": 188.27857971191406,
"epoch": 0.6206252916472235,
"grad_norm": 3.5433154106140137,
"kl": 0.89716796875,
"learning_rate": 3.7821356574893204e-07,
"loss": 0.1548,
"reward": 1.31071435213089,
"reward_std": 0.26513244956731796,
"rewards/accuracy_reward": 0.36785716116428374,
"rewards/format_reward": 0.942857164144516,
"step": 1330
},
{
"completion_length": 196.4714385986328,
"epoch": 0.6252916472235185,
"grad_norm": 6.886636734008789,
"kl": 0.853076171875,
"learning_rate": 3.7032817415422517e-07,
"loss": 0.1634,
"reward": 1.2678572058677673,
"reward_std": 0.2711702950298786,
"rewards/accuracy_reward": 0.3321428790688515,
"rewards/format_reward": 0.935714316368103,
"step": 1340
},
{
"completion_length": 197.9714385986328,
"epoch": 0.6299580027998134,
"grad_norm": 10.59721565246582,
"kl": 1.061083984375,
"learning_rate": 3.624772113450223e-07,
"loss": 0.1761,
"reward": 1.2678572058677673,
"reward_std": 0.32303600385785103,
"rewards/accuracy_reward": 0.36071430891752243,
"rewards/format_reward": 0.9071428954601288,
"step": 1350
},
{
"completion_length": 180.4321502685547,
"epoch": 0.6346243583761083,
"grad_norm": 2.4233856201171875,
"kl": 0.690625,
"learning_rate": 3.5466276180739264e-07,
"loss": 0.0947,
"reward": 1.2892857670783997,
"reward_std": 0.21290518939495087,
"rewards/accuracy_reward": 0.3392857272177935,
"rewards/format_reward": 0.9500000238418579,
"step": 1360
},
{
"completion_length": 196.02857971191406,
"epoch": 0.6392907139524032,
"grad_norm": 18.396207809448242,
"kl": 0.962060546875,
"learning_rate": 3.4688690033287414e-07,
"loss": 0.155,
"reward": 1.3535714745521545,
"reward_std": 0.24271938800811768,
"rewards/accuracy_reward": 0.4250000178813934,
"rewards/format_reward": 0.9285714626312256,
"step": 1370
},
{
"completion_length": 194.17501068115234,
"epoch": 0.6439570695286981,
"grad_norm": 12.984419822692871,
"kl": 0.37880859375,
"learning_rate": 3.3915169146760137e-07,
"loss": 0.096,
"reward": 1.2642857909202576,
"reward_std": 0.2268330782651901,
"rewards/accuracy_reward": 0.33571430034935473,
"rewards/format_reward": 0.9285714507102967,
"step": 1380
},
{
"completion_length": 180.59286499023438,
"epoch": 0.648623425104993,
"grad_norm": 3.471736192703247,
"kl": 0.78525390625,
"learning_rate": 3.3145918896415394e-07,
"loss": 0.0905,
"reward": 1.3535714745521545,
"reward_std": 0.1773286685347557,
"rewards/accuracy_reward": 0.417857164144516,
"rewards/format_reward": 0.935714316368103,
"step": 1390
},
{
"completion_length": 172.69286499023437,
"epoch": 0.653289780681288,
"grad_norm": 5.568473815917969,
"kl": 0.4484130859375,
"learning_rate": 3.2381143523627106e-07,
"loss": 0.0142,
"reward": 1.3071429252624511,
"reward_std": 0.19887898862361908,
"rewards/accuracy_reward": 0.3500000134110451,
"rewards/format_reward": 0.9571428775787354,
"step": 1400
},
{
"completion_length": 197.35714874267578,
"epoch": 0.6579561362575829,
"grad_norm": 5.128924369812012,
"kl": 0.848291015625,
"learning_rate": 3.16210460816576e-07,
"loss": 0.1411,
"reward": 1.2464286088943481,
"reward_std": 0.20331502109766006,
"rewards/accuracy_reward": 0.3035714402794838,
"rewards/format_reward": 0.9428571701049805,
"step": 1410
},
{
"completion_length": 205.62858123779296,
"epoch": 0.6626224918338778,
"grad_norm": 6.538782596588135,
"kl": 0.71435546875,
"learning_rate": 3.086582838174551e-07,
"loss": 0.1207,
"reward": 1.210714328289032,
"reward_std": 0.26858522146940234,
"rewards/accuracy_reward": 0.2750000096857548,
"rewards/format_reward": 0.9357143104076385,
"step": 1420
},
{
"completion_length": 177.58214874267577,
"epoch": 0.6672888474101727,
"grad_norm": 2.0602848529815674,
"kl": 0.540087890625,
"learning_rate": 3.0115690939523514e-07,
"loss": 0.0609,
"reward": 1.2571429014205933,
"reward_std": 0.19617216065526008,
"rewards/accuracy_reward": 0.3071428701281548,
"rewards/format_reward": 0.9500000238418579,
"step": 1430
},
{
"completion_length": 190.9821517944336,
"epoch": 0.6719552029864676,
"grad_norm": 1.0282678604125977,
"kl": 0.74091796875,
"learning_rate": 2.9370832921779983e-07,
"loss": 0.1188,
"reward": 1.2035714626312255,
"reward_std": 0.22572807371616363,
"rewards/accuracy_reward": 0.25357144251465796,
"rewards/format_reward": 0.9500000238418579,
"step": 1440
},
{
"completion_length": 197.7464385986328,
"epoch": 0.6766215585627625,
"grad_norm": 5.613475799560547,
"kl": 0.711767578125,
"learning_rate": 2.8631452093578814e-07,
"loss": 0.1211,
"reward": 1.3035714864730834,
"reward_std": 0.20981329679489136,
"rewards/accuracy_reward": 0.3750000141561031,
"rewards/format_reward": 0.9285714626312256,
"step": 1450
},
{
"completion_length": 200.86786651611328,
"epoch": 0.6812879141390574,
"grad_norm": 6.9082841873168945,
"kl": 1.02734375,
"learning_rate": 2.7897744765751375e-07,
"loss": 0.1942,
"reward": 1.321428620815277,
"reward_std": 0.2701858140528202,
"rewards/accuracy_reward": 0.4000000197440386,
"rewards/format_reward": 0.9214286088943482,
"step": 1460
},
{
"completion_length": 191.17858123779297,
"epoch": 0.6859542697153523,
"grad_norm": 116.39089965820312,
"kl": 0.575634765625,
"learning_rate": 2.716990574277469e-07,
"loss": 0.086,
"reward": 1.2821429133415223,
"reward_std": 0.21748021617531776,
"rewards/accuracy_reward": 0.33214287310838697,
"rewards/format_reward": 0.9500000178813934,
"step": 1470
},
{
"completion_length": 196.97500762939453,
"epoch": 0.6906206252916472,
"grad_norm": 5.2841033935546875,
"kl": 0.8533203125,
"learning_rate": 2.644812827104933e-07,
"loss": 0.1501,
"reward": 1.175000047683716,
"reward_std": 0.30686734020709994,
"rewards/accuracy_reward": 0.2821428686380386,
"rewards/format_reward": 0.8928571820259095,
"step": 1480
},
{
"completion_length": 186.7571533203125,
"epoch": 0.6952869808679422,
"grad_norm": 4.615259170532227,
"kl": 0.45205078125,
"learning_rate": 2.573260398759125e-07,
"loss": 0.0948,
"reward": 1.346428632736206,
"reward_std": 0.12943540289998054,
"rewards/accuracy_reward": 0.36785715967416766,
"rewards/format_reward": 0.9785714387893677,
"step": 1490
},
{
"completion_length": 175.87857818603516,
"epoch": 0.6999533364442371,
"grad_norm": 2.3132364749908447,
"kl": 0.661865234375,
"learning_rate": 2.5023522869150705e-07,
"loss": 0.0561,
"reward": 1.2535714864730836,
"reward_std": 0.21969022005796432,
"rewards/accuracy_reward": 0.31785715371370316,
"rewards/format_reward": 0.935714316368103,
"step": 1500
},
{
"completion_length": 187.2821517944336,
"epoch": 0.704619692020532,
"grad_norm": 29.17850685119629,
"kl": 0.95556640625,
"learning_rate": 2.432107318177217e-07,
"loss": 0.1785,
"reward": 1.3500000596046449,
"reward_std": 0.25344905629754066,
"rewards/accuracy_reward": 0.42142859399318694,
"rewards/format_reward": 0.9285714566707611,
"step": 1510
},
{
"completion_length": 177.58572235107422,
"epoch": 0.7092860475968269,
"grad_norm": 1.220908284187317,
"kl": 0.591015625,
"learning_rate": 2.3625441430808347e-07,
"loss": 0.0738,
"reward": 1.4071429133415223,
"reward_std": 0.21487789303064347,
"rewards/accuracy_reward": 0.4714285969734192,
"rewards/format_reward": 0.9357143044471741,
"step": 1520
},
{
"completion_length": 180.8821533203125,
"epoch": 0.7139524031731218,
"grad_norm": 4.141109943389893,
"kl": 0.378857421875,
"learning_rate": 2.2936812311401682e-07,
"loss": 0.0597,
"reward": 1.3000000715255737,
"reward_std": 0.17239581793546677,
"rewards/accuracy_reward": 0.3357143022119999,
"rewards/format_reward": 0.9642857313156128,
"step": 1530
},
{
"completion_length": 183.73214874267578,
"epoch": 0.7186187587494167,
"grad_norm": 10.828474044799805,
"kl": 0.504296875,
"learning_rate": 2.225536865944646e-07,
"loss": 0.0564,
"reward": 1.3321429133415221,
"reward_std": 0.14654723256826402,
"rewards/accuracy_reward": 0.3607143074274063,
"rewards/format_reward": 0.9714285850524902,
"step": 1540
},
{
"completion_length": 189.9714370727539,
"epoch": 0.7232851143257116,
"grad_norm": 7.55503511428833,
"kl": 0.721484375,
"learning_rate": 2.1581291403044632e-07,
"loss": 0.1054,
"reward": 1.2250000596046449,
"reward_std": 0.2633721731603146,
"rewards/accuracy_reward": 0.2892857283353806,
"rewards/format_reward": 0.9357143104076385,
"step": 1550
},
{
"completion_length": 180.25000762939453,
"epoch": 0.7279514699020065,
"grad_norm": 9.971400260925293,
"kl": 0.6979736328125,
"learning_rate": 2.0914759514468106e-07,
"loss": 0.1232,
"reward": 1.2785714745521546,
"reward_std": 0.2545368172228336,
"rewards/accuracy_reward": 0.357142873480916,
"rewards/format_reward": 0.9214286029338836,
"step": 1560
},
{
"completion_length": 198.396435546875,
"epoch": 0.7326178254783015,
"grad_norm": 9.208026885986328,
"kl": 1.296337890625,
"learning_rate": 2.0255949962640333e-07,
"loss": 0.2623,
"reward": 1.2785715103149413,
"reward_std": 0.27719091176986693,
"rewards/accuracy_reward": 0.3785714417695999,
"rewards/format_reward": 0.900000023841858,
"step": 1570
},
{
"completion_length": 204.6928680419922,
"epoch": 0.7372841810545964,
"grad_norm": 7.680899620056152,
"kl": 0.668994140625,
"learning_rate": 1.9605037666149832e-07,
"loss": 0.1278,
"reward": 1.2857143521308898,
"reward_std": 0.24715665131807327,
"rewards/accuracy_reward": 0.357142873108387,
"rewards/format_reward": 0.9285714566707611,
"step": 1580
},
{
"completion_length": 185.85357818603515,
"epoch": 0.7419505366308913,
"grad_norm": 8.488438606262207,
"kl": 0.361083984375,
"learning_rate": 1.8962195446808083e-07,
"loss": 0.0404,
"reward": 1.196428644657135,
"reward_std": 0.25750192254781723,
"rewards/accuracy_reward": 0.26785715520381925,
"rewards/format_reward": 0.9285714507102967,
"step": 1590
},
{
"completion_length": 196.72500762939453,
"epoch": 0.7466168922071862,
"grad_norm": 4.548067092895508,
"kl": 1.42060546875,
"learning_rate": 1.8327593983764057e-07,
"loss": 0.2529,
"reward": 1.335714328289032,
"reward_std": 0.377190912514925,
"rewards/accuracy_reward": 0.4285714507102966,
"rewards/format_reward": 0.9071428894996643,
"step": 1600
},
{
"completion_length": 177.2678649902344,
"epoch": 0.7512832477834811,
"grad_norm": 3.3683552742004395,
"kl": 0.58388671875,
"learning_rate": 1.770140176818774e-07,
"loss": 0.0739,
"reward": 1.3428571939468383,
"reward_std": 0.18543876633048056,
"rewards/accuracy_reward": 0.3642857313156128,
"rewards/format_reward": 0.9785714387893677,
"step": 1610
},
{
"completion_length": 203.56429595947264,
"epoch": 0.755949603359776,
"grad_norm": 3.158090114593506,
"kl": 0.81240234375,
"learning_rate": 1.7083785058534566e-07,
"loss": 0.1285,
"reward": 1.2821429371833801,
"reward_std": 0.24443381130695344,
"rewards/accuracy_reward": 0.3392857268452644,
"rewards/format_reward": 0.9428571701049805,
"step": 1620
},
{
"completion_length": 203.41429443359374,
"epoch": 0.7606159589360709,
"grad_norm": 3.8291237354278564,
"kl": 1.1529296875,
"learning_rate": 1.6474907836402507e-07,
"loss": 0.1792,
"reward": 1.2678571939468384,
"reward_std": 0.24378738924860954,
"rewards/accuracy_reward": 0.3464285895228386,
"rewards/format_reward": 0.9214285969734192,
"step": 1630
},
{
"completion_length": 204.32500915527345,
"epoch": 0.7652823145123658,
"grad_norm": 127.166259765625,
"kl": 1.08779296875,
"learning_rate": 1.5874931762993933e-07,
"loss": 0.1349,
"reward": 1.196428620815277,
"reward_std": 0.26433941870927813,
"rewards/accuracy_reward": 0.28928572684526443,
"rewards/format_reward": 0.9071428775787354,
"step": 1640
},
{
"completion_length": 210.50001068115233,
"epoch": 0.7699486700886607,
"grad_norm": 1.736830472946167,
"kl": 0.989453125,
"learning_rate": 1.5284016136193396e-07,
"loss": 0.2122,
"reward": 1.2178571939468383,
"reward_std": 0.2605919159948826,
"rewards/accuracy_reward": 0.2964285835623741,
"rewards/format_reward": 0.9214286088943482,
"step": 1650
},
{
"completion_length": 201.36072235107423,
"epoch": 0.7746150256649557,
"grad_norm": 10.542801856994629,
"kl": 1.331201171875,
"learning_rate": 1.4702317848272838e-07,
"loss": 0.2161,
"reward": 1.3214286327362061,
"reward_std": 0.28298772796988486,
"rewards/accuracy_reward": 0.40000002086162567,
"rewards/format_reward": 0.9214286029338836,
"step": 1660
},
{
"completion_length": 192.8857208251953,
"epoch": 0.7792813812412506,
"grad_norm": 6.193233966827393,
"kl": 1.0849609375,
"learning_rate": 1.4129991344235653e-07,
"loss": 0.1358,
"reward": 1.2321429014205934,
"reward_std": 0.2178552895784378,
"rewards/accuracy_reward": 0.31785715706646445,
"rewards/format_reward": 0.9142857432365418,
"step": 1670
},
{
"completion_length": 173.0357223510742,
"epoch": 0.7839477368175455,
"grad_norm": 16.36906623840332,
"kl": 0.372314453125,
"learning_rate": 1.3567188580810435e-07,
"loss": 0.0753,
"reward": 1.4285714864730834,
"reward_std": 0.19271938651800155,
"rewards/accuracy_reward": 0.46428574323654176,
"rewards/format_reward": 0.9642857313156128,
"step": 1680
},
{
"completion_length": 194.05357971191407,
"epoch": 0.7886140923938404,
"grad_norm": 2.392770290374756,
"kl": 0.406396484375,
"learning_rate": 1.3014058986105374e-07,
"loss": 0.0856,
"reward": 1.2535714864730836,
"reward_std": 0.15812735334038736,
"rewards/accuracy_reward": 0.2892857283353806,
"rewards/format_reward": 0.9642857313156128,
"step": 1690
},
{
"completion_length": 175.20000762939452,
"epoch": 0.7932804479701353,
"grad_norm": 4.713147163391113,
"kl": 0.342333984375,
"learning_rate": 1.2470749419934057e-07,
"loss": 0.0522,
"reward": 1.435714340209961,
"reward_std": 0.1269535943865776,
"rewards/accuracy_reward": 0.4428571715950966,
"rewards/format_reward": 0.9928571462631226,
"step": 1700
},
{
"completion_length": 178.71429595947265,
"epoch": 0.7979468035464302,
"grad_norm": 2.6368408203125,
"kl": 0.63681640625,
"learning_rate": 1.1937404134823175e-07,
"loss": 0.0749,
"reward": 1.2642857551574707,
"reward_std": 0.21649573594331742,
"rewards/accuracy_reward": 0.3000000137835741,
"rewards/format_reward": 0.9642857313156128,
"step": 1710
},
{
"completion_length": 180.82500915527345,
"epoch": 0.8026131591227251,
"grad_norm": 4.20245361328125,
"kl": 0.404296875,
"learning_rate": 1.1414164737712401e-07,
"loss": 0.0445,
"reward": 1.3035714864730834,
"reward_std": 0.21377288773655892,
"rewards/accuracy_reward": 0.35357144474983215,
"rewards/format_reward": 0.9500000178813934,
"step": 1720
},
{
"completion_length": 193.33929290771485,
"epoch": 0.80727951469902,
"grad_norm": 12.832620620727539,
"kl": 0.924365234375,
"learning_rate": 1.0901170152356775e-07,
"loss": 0.1151,
"reward": 1.2142857551574706,
"reward_std": 0.24824440032243728,
"rewards/accuracy_reward": 0.3000000149011612,
"rewards/format_reward": 0.9142857551574707,
"step": 1730
},
{
"completion_length": 195.20358123779297,
"epoch": 0.811945870275315,
"grad_norm": 8.172798156738281,
"kl": 0.75888671875,
"learning_rate": 1.0398556582441481e-07,
"loss": 0.1337,
"reward": 1.271428644657135,
"reward_std": 0.271032539755106,
"rewards/accuracy_reward": 0.3428571570664644,
"rewards/format_reward": 0.9285714626312256,
"step": 1740
},
{
"completion_length": 183.7714385986328,
"epoch": 0.8166122258516099,
"grad_norm": 12.028782844543457,
"kl": 1.259375,
"learning_rate": 9.906457475418778e-08,
"loss": 0.1913,
"reward": 1.3107143759727478,
"reward_std": 0.25871951803565024,
"rewards/accuracy_reward": 0.38214287757873533,
"rewards/format_reward": 0.9285714566707611,
"step": 1750
},
{
"completion_length": 195.21072387695312,
"epoch": 0.8212785814279048,
"grad_norm": 8.357789039611816,
"kl": 0.935986328125,
"learning_rate": 9.425003487076789e-08,
"loss": 0.1143,
"reward": 1.2392857670783997,
"reward_std": 0.2283131591975689,
"rewards/accuracy_reward": 0.30357144549489024,
"rewards/format_reward": 0.9357143104076385,
"step": 1760
},
{
"completion_length": 186.48572387695313,
"epoch": 0.8259449370041997,
"grad_norm": 4.591169834136963,
"kl": 0.8271484375,
"learning_rate": 8.954322446849444e-08,
"loss": 0.1123,
"reward": 1.3535715103149415,
"reward_std": 0.23052316084504126,
"rewards/accuracy_reward": 0.40357144773006437,
"rewards/format_reward": 0.9500000238418579,
"step": 1770
},
{
"completion_length": 203.71072082519532,
"epoch": 0.8306112925804946,
"grad_norm": 4.072372913360596,
"kl": 1.01142578125,
"learning_rate": 8.494539323876871e-08,
"loss": 0.1496,
"reward": 1.2678571939468384,
"reward_std": 0.24972448274493217,
"rewards/accuracy_reward": 0.34642858654260633,
"rewards/format_reward": 0.9214285969734192,
"step": 1780
},
{
"completion_length": 173.66429290771484,
"epoch": 0.8352776481567895,
"grad_norm": 1.4727064371109009,
"kl": 0.4848876953125,
"learning_rate": 8.045776193825204e-08,
"loss": 0.0449,
"reward": 1.335714340209961,
"reward_std": 0.20700509771704673,
"rewards/accuracy_reward": 0.3857143074274063,
"rewards/format_reward": 0.9500000238418579,
"step": 1790
},
{
"completion_length": 178.2857223510742,
"epoch": 0.8399440037330844,
"grad_norm": 2.1876659393310547,
"kl": 0.60693359375,
"learning_rate": 7.608152206474638e-08,
"loss": 0.0354,
"reward": 1.36071435213089,
"reward_std": 0.2141479544341564,
"rewards/accuracy_reward": 0.41071430742740633,
"rewards/format_reward": 0.9500000238418579,
"step": 1800
},
{
"completion_length": 184.4964370727539,
"epoch": 0.8446103593093793,
"grad_norm": 1.164255976676941,
"kl": 0.330322265625,
"learning_rate": 7.181783554084308e-08,
"loss": 0.0332,
"reward": 1.3071429133415222,
"reward_std": 0.13999654576182366,
"rewards/accuracy_reward": 0.32142859026789666,
"rewards/format_reward": 0.9857142925262451,
"step": 1810
},
{
"completion_length": 180.47501068115236,
"epoch": 0.8492767148856742,
"grad_norm": 17.824142456054688,
"kl": 0.4853271484375,
"learning_rate": 6.766783440542434e-08,
"loss": 0.0599,
"reward": 1.2642857670783996,
"reward_std": 0.19838216677308082,
"rewards/accuracy_reward": 0.32857144847512243,
"rewards/format_reward": 0.935714316368103,
"step": 1820
},
{
"completion_length": 187.25000915527343,
"epoch": 0.8539430704619692,
"grad_norm": 7.02644681930542,
"kl": 0.693896484375,
"learning_rate": 6.363262051309908e-08,
"loss": 0.1129,
"reward": 1.2892857670783997,
"reward_std": 0.27438203766942026,
"rewards/accuracy_reward": 0.35357144474983215,
"rewards/format_reward": 0.9357143104076385,
"step": 1830
},
{
"completion_length": 190.3964370727539,
"epoch": 0.8586094260382641,
"grad_norm": 0.5623534321784973,
"kl": 0.6712890625,
"learning_rate": 5.971326524165226e-08,
"loss": 0.1025,
"reward": 1.296428644657135,
"reward_std": 0.2583474151790142,
"rewards/accuracy_reward": 0.36785715967416766,
"rewards/format_reward": 0.9285714566707611,
"step": 1840
},
{
"completion_length": 191.0107223510742,
"epoch": 0.863275781614559,
"grad_norm": 0.3061552047729492,
"kl": 0.79541015625,
"learning_rate": 5.591080920758695e-08,
"loss": 0.1553,
"reward": 1.2821429252624512,
"reward_std": 0.2908777602016926,
"rewards/accuracy_reward": 0.3678571581840515,
"rewards/format_reward": 0.9142857491970062,
"step": 1850
},
{
"completion_length": 190.6214385986328,
"epoch": 0.8679421371908539,
"grad_norm": 4.034696578979492,
"kl": 0.664599609375,
"learning_rate": 5.22262619898331e-08,
"loss": 0.1263,
"reward": 1.2821429133415223,
"reward_std": 0.24259887337684632,
"rewards/accuracy_reward": 0.3464285880327225,
"rewards/format_reward": 0.935714316368103,
"step": 1860
},
{
"completion_length": 192.10358276367188,
"epoch": 0.8726084927671488,
"grad_norm": 11.7506742477417,
"kl": 1.0009765625,
"learning_rate": 4.8660601861697294e-08,
"loss": 0.1442,
"reward": 1.3321429133415221,
"reward_std": 0.2509672470390797,
"rewards/accuracy_reward": 0.39642858803272246,
"rewards/format_reward": 0.935714316368103,
"step": 1870
},
{
"completion_length": 194.44286346435547,
"epoch": 0.8772748483434438,
"grad_norm": 2.883983850479126,
"kl": 0.7970703125,
"learning_rate": 4.5214775531124184e-08,
"loss": 0.0791,
"reward": 1.2214286148548126,
"reward_std": 0.2235020525753498,
"rewards/accuracy_reward": 0.2785714402794838,
"rewards/format_reward": 0.9428571581840515,
"step": 1880
},
{
"completion_length": 177.9107223510742,
"epoch": 0.8819412039197387,
"grad_norm": 12.7833251953125,
"kl": 0.6814453125,
"learning_rate": 4.188969788933899e-08,
"loss": 0.0794,
"reward": 1.2928571939468383,
"reward_std": 0.2521940000355244,
"rewards/accuracy_reward": 0.36428572833538053,
"rewards/format_reward": 0.9285714507102967,
"step": 1890
},
{
"completion_length": 181.83929595947265,
"epoch": 0.8866075594960336,
"grad_norm": 1.639757513999939,
"kl": 1.08232421875,
"learning_rate": 3.8686251767937325e-08,
"loss": 0.1071,
"reward": 1.3000000596046448,
"reward_std": 0.22904308661818504,
"rewards/accuracy_reward": 0.35714287161827085,
"rewards/format_reward": 0.942857164144516,
"step": 1900
},
{
"completion_length": 192.0964370727539,
"epoch": 0.8912739150723286,
"grad_norm": 3.8928933143615723,
"kl": 0.3626220703125,
"learning_rate": 3.560528770448712e-08,
"loss": 0.064,
"reward": 1.3571429014205934,
"reward_std": 0.14244568049907685,
"rewards/accuracy_reward": 0.3928571566939354,
"rewards/format_reward": 0.9642857313156128,
"step": 1910
},
{
"completion_length": 198.77500915527344,
"epoch": 0.8959402706486235,
"grad_norm": 4.202512741088867,
"kl": 0.386767578125,
"learning_rate": 3.264762371670493e-08,
"loss": 0.0725,
"reward": 1.2428572058677674,
"reward_std": 0.18409645855426787,
"rewards/accuracy_reward": 0.2928571604192257,
"rewards/format_reward": 0.9500000238418579,
"step": 1920
},
{
"completion_length": 182.1321548461914,
"epoch": 0.9006066262249184,
"grad_norm": 8.796235084533691,
"kl": 0.31845703125,
"learning_rate": 2.981404508526653e-08,
"loss": 0.049,
"reward": 1.3142857670783996,
"reward_std": 0.19875723943114282,
"rewards/accuracy_reward": 0.35714287757873536,
"rewards/format_reward": 0.9571428775787354,
"step": 1930
},
{
"completion_length": 183.12857818603516,
"epoch": 0.9052729818012133,
"grad_norm": 5.363933086395264,
"kl": 0.4515380859375,
"learning_rate": 2.7105304145309317e-08,
"loss": 0.0753,
"reward": 1.3535714864730835,
"reward_std": 0.22955592721700668,
"rewards/accuracy_reward": 0.40357145145535467,
"rewards/format_reward": 0.9500000238418579,
"step": 1940
},
{
"completion_length": 178.5964370727539,
"epoch": 0.9099393373775082,
"grad_norm": 4.07145357131958,
"kl": 0.243212890625,
"learning_rate": 2.4522120086681975e-08,
"loss": 0.035,
"reward": 1.3750000596046448,
"reward_std": 0.1731257550418377,
"rewards/accuracy_reward": 0.38928572833538055,
"rewards/format_reward": 0.9857142925262451,
"step": 1950
},
{
"completion_length": 191.87858123779296,
"epoch": 0.9146056929538031,
"grad_norm": 4.126840114593506,
"kl": 0.7501953125,
"learning_rate": 2.2065178762994517e-08,
"loss": 0.1034,
"reward": 1.2821429014205932,
"reward_std": 0.1828709363937378,
"rewards/accuracy_reward": 0.3392857313156128,
"rewards/format_reward": 0.942857164144516,
"step": 1960
},
{
"completion_length": 191.27857818603516,
"epoch": 0.919272048530098,
"grad_norm": 3.2844011783599854,
"kl": 1.066259765625,
"learning_rate": 1.9735132509519302e-08,
"loss": 0.1838,
"reward": 1.271428632736206,
"reward_std": 0.33275041803717614,
"rewards/accuracy_reward": 0.35714287757873536,
"rewards/format_reward": 0.9142857491970062,
"step": 1970
},
{
"completion_length": 182.68929290771484,
"epoch": 0.9239384041063929,
"grad_norm": 5.930522441864014,
"kl": 0.534765625,
"learning_rate": 1.7532599969991347e-08,
"loss": 0.0479,
"reward": 1.2785714864730835,
"reward_std": 0.19234431087970733,
"rewards/accuracy_reward": 0.3214285895228386,
"rewards/format_reward": 0.9571428775787354,
"step": 1980
},
{
"completion_length": 178.8928649902344,
"epoch": 0.9286047596826879,
"grad_norm": 0.8274029493331909,
"kl": 0.40849609375,
"learning_rate": 1.545816593235416e-08,
"loss": 0.0388,
"reward": 1.3142857789993285,
"reward_std": 0.20442002266645432,
"rewards/accuracy_reward": 0.3500000134110451,
"rewards/format_reward": 0.9642857313156128,
"step": 1990
},
{
"completion_length": 188.56786499023437,
"epoch": 0.9332711152589828,
"grad_norm": 6.531651020050049,
"kl": 1.097607421875,
"learning_rate": 1.3512381173494458e-08,
"loss": 0.2064,
"reward": 1.3535714983940124,
"reward_std": 0.24394118189811706,
"rewards/accuracy_reward": 0.4250000178813934,
"rewards/format_reward": 0.9285714626312256,
"step": 2000
},
{
"completion_length": 194.05000762939454,
"epoch": 0.9379374708352777,
"grad_norm": 7.329962253570557,
"kl": 0.97109375,
"learning_rate": 1.169576231300684e-08,
"loss": 0.1501,
"reward": 1.2750000476837158,
"reward_std": 0.3136565685272217,
"rewards/accuracy_reward": 0.3607142955064774,
"rewards/format_reward": 0.9142857491970062,
"step": 2010
},
{
"completion_length": 198.34286499023438,
"epoch": 0.9426038264115726,
"grad_norm": 5.229209899902344,
"kl": 0.5191162109375,
"learning_rate": 1.000879167602764e-08,
"loss": 0.087,
"reward": 1.285714340209961,
"reward_std": 0.1509844921529293,
"rewards/accuracy_reward": 0.32857144698500634,
"rewards/format_reward": 0.9571428775787354,
"step": 2020
},
{
"completion_length": 202.35357971191405,
"epoch": 0.9472701819878675,
"grad_norm": 2.168286085128784,
"kl": 0.821923828125,
"learning_rate": 8.451917165174404e-09,
"loss": 0.1315,
"reward": 1.3642857670783997,
"reward_std": 0.23890879452228547,
"rewards/accuracy_reward": 0.4214285880327225,
"rewards/format_reward": 0.9428571581840515,
"step": 2030
},
{
"completion_length": 205.11786346435548,
"epoch": 0.9519365375641624,
"grad_norm": 11.269577980041504,
"kl": 1.1117919921875,
"learning_rate": 7.025552141624369e-09,
"loss": 0.2032,
"reward": 1.1714286088943482,
"reward_std": 0.2642820030450821,
"rewards/accuracy_reward": 0.2571428693830967,
"rewards/format_reward": 0.9142857372760773,
"step": 2040
},
{
"completion_length": 190.096435546875,
"epoch": 0.9566028931404573,
"grad_norm": 0.18682968616485596,
"kl": 0.72197265625,
"learning_rate": 5.730075315364346e-09,
"loss": 0.1334,
"reward": 1.2321429133415223,
"reward_std": 0.25800683721899986,
"rewards/accuracy_reward": 0.29642858505249026,
"rewards/format_reward": 0.935714316368103,
"step": 2050
},
{
"completion_length": 184.56429443359374,
"epoch": 0.9612692487167522,
"grad_norm": 2.113898754119873,
"kl": 0.876220703125,
"learning_rate": 4.565830644640223e-09,
"loss": 0.1067,
"reward": 1.271428632736206,
"reward_std": 0.3147573724389076,
"rewards/accuracy_reward": 0.3642857313156128,
"rewards/format_reward": 0.9071428894996643,
"step": 2060
},
{
"completion_length": 184.8857208251953,
"epoch": 0.9659356042930471,
"grad_norm": 6.786799907684326,
"kl": 0.4216796875,
"learning_rate": 3.533127244634171e-09,
"loss": 0.046,
"reward": 1.3571429133415223,
"reward_std": 0.17572808191180228,
"rewards/accuracy_reward": 0.38571430146694186,
"rewards/format_reward": 0.9714285790920257,
"step": 2070
},
{
"completion_length": 188.2321517944336,
"epoch": 0.9706019598693421,
"grad_norm": 6.5551042556762695,
"kl": 0.491552734375,
"learning_rate": 2.6322393053916925e-09,
"loss": 0.082,
"reward": 1.2392857670783997,
"reward_std": 0.24259887263178825,
"rewards/accuracy_reward": 0.29642858952283857,
"rewards/format_reward": 0.9428571701049805,
"step": 2080
},
{
"completion_length": 197.81429748535157,
"epoch": 0.975268315445637,
"grad_norm": 3.5728259086608887,
"kl": 0.3767578125,
"learning_rate": 1.86340601902274e-09,
"loss": 0.052,
"reward": 1.2035714983940125,
"reward_std": 0.18816160932183265,
"rewards/accuracy_reward": 0.26071429550647734,
"rewards/format_reward": 0.9428571701049805,
"step": 2090
},
{
"completion_length": 184.00000915527343,
"epoch": 0.9799346710219319,
"grad_norm": 8.470995903015137,
"kl": 0.66552734375,
"learning_rate": 1.2268315161944044e-09,
"loss": 0.0892,
"reward": 1.2678572058677673,
"reward_std": 0.16525296717882157,
"rewards/accuracy_reward": 0.2964285921305418,
"rewards/format_reward": 0.9714285850524902,
"step": 2100
},
{
"completion_length": 190.1964370727539,
"epoch": 0.9846010265982268,
"grad_norm": 5.933376312255859,
"kl": 0.58525390625,
"learning_rate": 7.226848119326057e-10,
"loss": 0.0743,
"reward": 1.2571429133415222,
"reward_std": 0.20183914229273797,
"rewards/accuracy_reward": 0.3285714477300644,
"rewards/format_reward": 0.9285714566707611,
"step": 2110
},
{
"completion_length": 203.46429290771485,
"epoch": 0.9892673821745217,
"grad_norm": 11.98684310913086,
"kl": 0.8004150390625,
"learning_rate": 3.510997607475974e-10,
"loss": 0.144,
"reward": 1.2071429133415221,
"reward_std": 0.29701889082789423,
"rewards/accuracy_reward": 0.28571429923176767,
"rewards/format_reward": 0.9214285969734192,
"step": 2120
},
{
"completion_length": 193.4964370727539,
"epoch": 0.9939337377508166,
"grad_norm": 1.8809298276901245,
"kl": 0.745263671875,
"learning_rate": 1.121750210946737e-10,
"loss": 0.1279,
"reward": 1.2571429133415222,
"reward_std": 0.26994478702545166,
"rewards/accuracy_reward": 0.3357143022119999,
"rewards/format_reward": 0.9214286029338836,
"step": 2130
},
{
"completion_length": 185.08929443359375,
"epoch": 0.9986000933271115,
"grad_norm": 1.6485497951507568,
"kl": 0.567529296875,
"learning_rate": 5.974029179456331e-12,
"loss": 0.097,
"reward": 1.310714328289032,
"reward_std": 0.1667502835392952,
"rewards/accuracy_reward": 0.3535714462399483,
"rewards/format_reward": 0.9571428775787354,
"step": 2140
},
{
"epoch": 1.0,
"eval_completion_length": 203.0802721296038,
"eval_kl": 0.6515764508928571,
"eval_loss": 0.1416667252779007,
"eval_reward": 1.1919643453189306,
"eval_reward_std": 0.2649446129798889,
"eval_rewards/accuracy_reward": 0.24681123665400914,
"eval_rewards/format_reward": 0.9451530916350228,
"eval_runtime": 118.4791,
"eval_samples_per_second": 2.532,
"eval_steps_per_second": 0.025,
"step": 2143
},
{
"epoch": 1.0,
"step": 2143,
"total_flos": 0.0,
"train_loss": 0.08651691437249102,
"train_runtime": 12893.4863,
"train_samples_per_second": 1.163,
"train_steps_per_second": 0.166
}
],
"logging_steps": 10,
"max_steps": 2143,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}