{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2143, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 211.3571517944336, "epoch": 0.004666355576294913, "grad_norm": 2.9974076747894287, "kl": 0.00047588348388671875, "learning_rate": 4.6511627906976744e-08, "loss": 0.044, "reward": 0.6535714566707611, "reward_std": 0.5595175564289093, "rewards/accuracy_reward": 0.010714286193251609, "rewards/format_reward": 0.6428571760654449, "step": 10 }, { "completion_length": 211.85358123779298, "epoch": 0.009332711152589827, "grad_norm": 2.421142816543579, "kl": 0.0005392074584960937, "learning_rate": 9.302325581395349e-08, "loss": 0.0331, "reward": 0.6000000357627868, "reward_std": 0.6225969046354294, "rewards/accuracy_reward": 0.021428572386503218, "rewards/format_reward": 0.5785714596509933, "step": 20 }, { "completion_length": 225.71072235107422, "epoch": 0.013999066728884742, "grad_norm": 2.8735339641571045, "kl": 0.0006612777709960937, "learning_rate": 1.3953488372093021e-07, "loss": 0.0571, "reward": 0.5857143223285675, "reward_std": 0.5592944413423538, "rewards/accuracy_reward": 0.00714285746216774, "rewards/format_reward": 0.5785714656114578, "step": 30 }, { "completion_length": 205.19286651611327, "epoch": 0.018665422305179653, "grad_norm": 2.0310211181640625, "kl": 0.0027374267578125, "learning_rate": 1.8604651162790698e-07, "loss": 0.0065, "reward": 0.6285714566707611, "reward_std": 0.5308447808027268, "rewards/accuracy_reward": 0.01428571492433548, "rewards/format_reward": 0.6142857350409031, "step": 40 }, { "completion_length": 194.54643707275392, "epoch": 0.02333177788147457, "grad_norm": 3.083293914794922, "kl": 0.00838165283203125, "learning_rate": 2.3255813953488372e-07, "loss": 0.0626, "reward": 0.7464286029338837, "reward_std": 0.46758472323417666, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.7285714626312256, "step": 50 }, { "completion_length": 197.08929595947265, "epoch": 0.027998133457769483, "grad_norm": 2.4293816089630127, "kl": 0.0138580322265625, "learning_rate": 2.7906976744186043e-07, "loss": 0.0292, "reward": 0.685714328289032, "reward_std": 0.47790482342243196, "rewards/accuracy_reward": 0.00714285746216774, "rewards/format_reward": 0.67857146859169, "step": 60 }, { "completion_length": 193.36786499023438, "epoch": 0.032664489034064395, "grad_norm": 1.755541443824768, "kl": 0.00519561767578125, "learning_rate": 3.2558139534883724e-07, "loss": 0.028, "reward": 0.8035714745521545, "reward_std": 0.36622021347284317, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.7857143342494964, "step": 70 }, { "completion_length": 206.29644012451172, "epoch": 0.03733084461035931, "grad_norm": 1.8061885833740234, "kl": 0.0106536865234375, "learning_rate": 3.7209302325581396e-07, "loss": 0.023, "reward": 0.6857143223285675, "reward_std": 0.4770329385995865, "rewards/accuracy_reward": 0.00714285746216774, "rewards/format_reward": 0.6785714626312256, "step": 80 }, { "completion_length": 205.0571517944336, "epoch": 0.041997200186654225, "grad_norm": 1.5979427099227905, "kl": 0.008978271484375, "learning_rate": 4.186046511627907e-07, "loss": 0.0297, "reward": 0.8071428894996643, "reward_std": 0.3088400363922119, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8071428894996643, "step": 90 }, { "completion_length": 217.22858123779298, "epoch": 0.04666355576294914, "grad_norm": 1.7703214883804321, "kl": 0.0110107421875, "learning_rate": 4.6511627906976743e-07, "loss": 0.037, "reward": 0.7285714745521545, "reward_std": 0.4834458529949188, "rewards/accuracy_reward": 0.00714285746216774, "rewards/format_reward": 0.7214286148548126, "step": 100 }, { "completion_length": 198.60000915527343, "epoch": 0.05132991133924405, "grad_norm": 1.311502456665039, "kl": 0.0128143310546875, "learning_rate": 5.116279069767442e-07, "loss": 0.0184, "reward": 0.7964286148548126, "reward_std": 0.3687057480216026, "rewards/accuracy_reward": 0.010714286193251609, "rewards/format_reward": 0.7857143342494964, "step": 110 }, { "completion_length": 210.16072540283204, "epoch": 0.05599626691553897, "grad_norm": 4.063913345336914, "kl": 0.010107421875, "learning_rate": 5.581395348837209e-07, "loss": 0.0155, "reward": 0.7964286148548126, "reward_std": 0.34110155403614045, "rewards/accuracy_reward": 0.010714286193251609, "rewards/format_reward": 0.785714328289032, "step": 120 }, { "completion_length": 203.16429443359374, "epoch": 0.06066262249183388, "grad_norm": 1.8343654870986938, "kl": 0.00831298828125, "learning_rate": 6.046511627906976e-07, "loss": 0.0343, "reward": 0.8607143282890319, "reward_std": 0.25221002399921416, "rewards/accuracy_reward": 0.00357142873108387, "rewards/format_reward": 0.8571429014205932, "step": 130 }, { "completion_length": 202.78929595947267, "epoch": 0.06532897806812879, "grad_norm": 0.8969087600708008, "kl": 0.0120025634765625, "learning_rate": 6.511627906976745e-07, "loss": 0.0145, "reward": 0.8321428954601288, "reward_std": 0.2753357619047165, "rewards/accuracy_reward": 0.00357142873108387, "rewards/format_reward": 0.8285714685916901, "step": 140 }, { "completion_length": 200.78929595947267, "epoch": 0.0699953336444237, "grad_norm": 2.4758172035217285, "kl": 0.016839599609375, "learning_rate": 6.976744186046511e-07, "loss": 0.0205, "reward": 0.90357146859169, "reward_std": 0.22839727699756623, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.8857143342494964, "step": 150 }, { "completion_length": 208.1607238769531, "epoch": 0.07466168922071861, "grad_norm": 1.8052928447723389, "kl": 0.012713623046875, "learning_rate": 7.441860465116279e-07, "loss": 0.0122, "reward": 0.9321428835391998, "reward_std": 0.1500000089406967, "rewards/accuracy_reward": 0.00357142873108387, "rewards/format_reward": 0.9285714566707611, "step": 160 }, { "completion_length": 210.90000762939454, "epoch": 0.07932804479701354, "grad_norm": 1.3043458461761475, "kl": 0.0185272216796875, "learning_rate": 7.906976744186046e-07, "loss": 0.0076, "reward": 0.9178571820259094, "reward_std": 0.1664957284927368, "rewards/accuracy_reward": 0.00357142873108387, "rewards/format_reward": 0.9142857551574707, "step": 170 }, { "completion_length": 204.02858123779296, "epoch": 0.08399440037330845, "grad_norm": 1.9711344242095947, "kl": 0.024969482421875, "learning_rate": 8.372093023255814e-07, "loss": -0.0035, "reward": 0.903571480512619, "reward_std": 0.1994871750473976, "rewards/accuracy_reward": 0.010714286193251609, "rewards/format_reward": 0.8928571879863739, "step": 180 }, { "completion_length": 201.1821517944336, "epoch": 0.08866075594960336, "grad_norm": 8.437910079956055, "kl": 0.023162841796875, "learning_rate": 8.837209302325581e-07, "loss": 0.0113, "reward": 0.9178571820259094, "reward_std": 0.18299144729971886, "rewards/accuracy_reward": 0.010714286193251609, "rewards/format_reward": 0.9071428894996643, "step": 190 }, { "completion_length": 205.58929443359375, "epoch": 0.09332711152589827, "grad_norm": 0.09394335001707077, "kl": 0.01868896484375, "learning_rate": 9.302325581395349e-07, "loss": 0.0195, "reward": 0.9214285910129547, "reward_std": 0.13299144804477692, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9214285910129547, "step": 200 }, { "completion_length": 217.59286651611328, "epoch": 0.09799346710219319, "grad_norm": 2.8316187858581543, "kl": 0.021905517578125, "learning_rate": 9.767441860465115e-07, "loss": 0.0257, "reward": 0.860714340209961, "reward_std": 0.2807814501225948, "rewards/accuracy_reward": 0.010714286193251609, "rewards/format_reward": 0.8500000536441803, "step": 210 }, { "completion_length": 205.23929290771486, "epoch": 0.1026598226784881, "grad_norm": 1.3520991802215576, "kl": 0.015557861328125, "learning_rate": 9.99983405533249e-07, "loss": 0.0129, "reward": 0.9571429014205932, "reward_std": 0.11428571939468384, "rewards/accuracy_reward": 0.00714285746216774, "rewards/format_reward": 0.9500000238418579, "step": 220 }, { "completion_length": 201.4571533203125, "epoch": 0.10732617825478301, "grad_norm": 6.0174970626831055, "kl": 0.034246826171875, "learning_rate": 9.99850656408199e-07, "loss": 0.016, "reward": 0.9785714983940125, "reward_std": 0.1857142947614193, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9428571701049805, "step": 230 }, { "completion_length": 197.5357223510742, "epoch": 0.11199253383107793, "grad_norm": 1.7986341714859009, "kl": 0.0531494140625, "learning_rate": 9.995851934039294e-07, "loss": 0.0582, "reward": 1.096428632736206, "reward_std": 0.23630646169185637, "rewards/accuracy_reward": 0.12500000596046448, "rewards/format_reward": 0.9714285850524902, "step": 240 }, { "completion_length": 208.5428680419922, "epoch": 0.11665888940737285, "grad_norm": 2.8913028240203857, "kl": 0.0509765625, "learning_rate": 9.991870870027424e-07, "loss": 0.0426, "reward": 1.0678572058677673, "reward_std": 0.19603439420461655, "rewards/accuracy_reward": 0.10357143431901931, "rewards/format_reward": 0.9642857313156128, "step": 250 }, { "completion_length": 187.68929443359374, "epoch": 0.12132524498366776, "grad_norm": 0.7588302493095398, "kl": 0.05869140625, "learning_rate": 9.98656442904699e-07, "loss": 0.028, "reward": 1.110714328289032, "reward_std": 0.17129081785678862, "rewards/accuracy_reward": 0.13928571566939354, "rewards/format_reward": 0.9714285850524902, "step": 260 }, { "completion_length": 196.61429443359376, "epoch": 0.12599160055996267, "grad_norm": 36.95159912109375, "kl": 0.0792724609375, "learning_rate": 9.979934019995547e-07, "loss": 0.0407, "reward": 1.1428571820259095, "reward_std": 0.299724480509758, "rewards/accuracy_reward": 0.20714286752045155, "rewards/format_reward": 0.935714316368103, "step": 270 }, { "completion_length": 207.1464385986328, "epoch": 0.13065795613625758, "grad_norm": 2.451353073120117, "kl": 0.057666015625, "learning_rate": 9.97198140329352e-07, "loss": 0.032, "reward": 1.1285714864730836, "reward_std": 0.24082783833146096, "rewards/accuracy_reward": 0.17142857611179352, "rewards/format_reward": 0.9571428775787354, "step": 280 }, { "completion_length": 207.25000610351563, "epoch": 0.1353243117125525, "grad_norm": 1.8005753755569458, "kl": 0.056298828125, "learning_rate": 9.962708690416806e-07, "loss": 0.0108, "reward": 1.221428632736206, "reward_std": 0.20909458994865418, "rewards/accuracy_reward": 0.25000000894069674, "rewards/format_reward": 0.9714285850524902, "step": 290 }, { "completion_length": 201.07143859863282, "epoch": 0.1399906672888474, "grad_norm": 1.7425264120101929, "kl": 0.052978515625, "learning_rate": 9.952118343336157e-07, "loss": 0.0208, "reward": 1.1678571820259094, "reward_std": 0.24439741671085358, "rewards/accuracy_reward": 0.22500001043081283, "rewards/format_reward": 0.9428571581840515, "step": 300 }, { "completion_length": 211.52858123779296, "epoch": 0.14465702286514232, "grad_norm": 2.8271803855895996, "kl": 0.0644287109375, "learning_rate": 9.940213173863515e-07, "loss": 0.0253, "reward": 1.1714285969734193, "reward_std": 0.2888915419578552, "rewards/accuracy_reward": 0.22857143878936767, "rewards/format_reward": 0.9428571701049805, "step": 310 }, { "completion_length": 212.41786499023436, "epoch": 0.14932337844143723, "grad_norm": 1.6736899614334106, "kl": 0.063427734375, "learning_rate": 9.926996342905446e-07, "loss": 0.0327, "reward": 1.2321429014205934, "reward_std": 0.22101710960268975, "rewards/accuracy_reward": 0.26785715818405154, "rewards/format_reward": 0.9642857313156128, "step": 320 }, { "completion_length": 222.3071533203125, "epoch": 0.15398973401773214, "grad_norm": 1.0232563018798828, "kl": 0.0625244140625, "learning_rate": 9.912471359623905e-07, "loss": 0.0303, "reward": 1.2321429014205934, "reward_std": 0.27462306767702105, "rewards/accuracy_reward": 0.30357144474983216, "rewards/format_reward": 0.9285714626312256, "step": 330 }, { "completion_length": 199.1357223510742, "epoch": 0.15865608959402708, "grad_norm": 0.4898677170276642, "kl": 0.057177734375, "learning_rate": 9.89664208050453e-07, "loss": 0.0513, "reward": 1.2321429133415223, "reward_std": 0.25317725613713266, "rewards/accuracy_reward": 0.27500001192092893, "rewards/format_reward": 0.9571428775787354, "step": 340 }, { "completion_length": 184.546435546875, "epoch": 0.163322445170322, "grad_norm": 2.1455774307250977, "kl": 0.069482421875, "learning_rate": 9.879512708332718e-07, "loss": 0.0067, "reward": 1.2250000476837157, "reward_std": 0.18287093117833136, "rewards/accuracy_reward": 0.2607142999768257, "rewards/format_reward": 0.9642857313156128, "step": 350 }, { "completion_length": 182.8071517944336, "epoch": 0.1679888007466169, "grad_norm": 1.5658093690872192, "kl": 0.0677490234375, "learning_rate": 9.861087791077743e-07, "loss": 0.0095, "reward": 1.2750000476837158, "reward_std": 0.16363511905074118, "rewards/accuracy_reward": 0.29642858952283857, "rewards/format_reward": 0.9785714387893677, "step": 360 }, { "completion_length": 190.7214385986328, "epoch": 0.1726551563229118, "grad_norm": 1.7997597455978394, "kl": 0.05615234375, "learning_rate": 9.841372220685253e-07, "loss": 0.0157, "reward": 1.235714316368103, "reward_std": 0.2049168437719345, "rewards/accuracy_reward": 0.2785714462399483, "rewards/format_reward": 0.9571428775787354, "step": 370 }, { "completion_length": 193.371435546875, "epoch": 0.17732151189920672, "grad_norm": 1.4841110706329346, "kl": 0.060205078125, "learning_rate": 9.820371231778422e-07, "loss": 0.0343, "reward": 1.3142857551574707, "reward_std": 0.23386457264423371, "rewards/accuracy_reward": 0.3642857328057289, "rewards/format_reward": 0.9500000178813934, "step": 380 }, { "completion_length": 201.2071563720703, "epoch": 0.18198786747550164, "grad_norm": 1.3704997301101685, "kl": 0.0554443359375, "learning_rate": 9.79809040026811e-07, "loss": 0.0187, "reward": 1.2250000596046449, "reward_std": 0.2502213083207607, "rewards/accuracy_reward": 0.28214287348091605, "rewards/format_reward": 0.9428571701049805, "step": 390 }, { "completion_length": 186.56429443359374, "epoch": 0.18665422305179655, "grad_norm": 2.2330009937286377, "kl": 0.0556884765625, "learning_rate": 9.774535641872433e-07, "loss": 0.0379, "reward": 1.2428571820259093, "reward_std": 0.3126678854227066, "rewards/accuracy_reward": 0.3214285898953676, "rewards/format_reward": 0.9214286029338836, "step": 400 }, { "completion_length": 171.86786651611328, "epoch": 0.19132057862809146, "grad_norm": 1.9319186210632324, "kl": 0.05830078125, "learning_rate": 9.749713210546087e-07, "loss": 0.0061, "reward": 1.371428632736206, "reward_std": 0.2255903147161007, "rewards/accuracy_reward": 0.4000000223517418, "rewards/format_reward": 0.9714285790920257, "step": 410 }, { "completion_length": 194.71429443359375, "epoch": 0.19598693420438637, "grad_norm": 1.76435387134552, "kl": 0.0529052734375, "learning_rate": 9.723629696819884e-07, "loss": 0.0143, "reward": 1.285714340209961, "reward_std": 0.20479509681463243, "rewards/accuracy_reward": 0.3357142999768257, "rewards/format_reward": 0.9500000238418579, "step": 420 }, { "completion_length": 188.8714370727539, "epoch": 0.20065328978068128, "grad_norm": 1.158916711807251, "kl": 0.0750244140625, "learning_rate": 9.696292026050922e-07, "loss": 0.0165, "reward": 1.3178571939468384, "reward_std": 0.26929790526628494, "rewards/accuracy_reward": 0.382142873480916, "rewards/format_reward": 0.935714316368103, "step": 430 }, { "completion_length": 179.84644012451173, "epoch": 0.2053196453569762, "grad_norm": 1.382643699645996, "kl": 0.11826171875, "learning_rate": 9.66770745658385e-07, "loss": 0.0183, "reward": 1.2571429371833802, "reward_std": 0.21266788095235825, "rewards/accuracy_reward": 0.3071428656578064, "rewards/format_reward": 0.9500000238418579, "step": 440 }, { "completion_length": 180.546435546875, "epoch": 0.2099860009332711, "grad_norm": 1.8777272701263428, "kl": 0.0839599609375, "learning_rate": 9.637883577823721e-07, "loss": 0.0276, "reward": 1.3428572058677672, "reward_std": 0.24049336314201356, "rewards/accuracy_reward": 0.40000001937150953, "rewards/format_reward": 0.9428571701049805, "step": 450 }, { "completion_length": 190.68572387695312, "epoch": 0.21465235650956602, "grad_norm": 1.0944005250930786, "kl": 0.0735107421875, "learning_rate": 9.606828308220969e-07, "loss": 0.0113, "reward": 1.2785714745521546, "reward_std": 0.21487789005041122, "rewards/accuracy_reward": 0.32857144325971605, "rewards/format_reward": 0.9500000238418579, "step": 460 }, { "completion_length": 197.60357971191405, "epoch": 0.21931871208586096, "grad_norm": 0.9634405970573425, "kl": 0.073779296875, "learning_rate": 9.574549893168977e-07, "loss": 0.0197, "reward": 1.2535714626312255, "reward_std": 0.17683308124542235, "rewards/accuracy_reward": 0.2892857242375612, "rewards/format_reward": 0.9642857313156128, "step": 470 }, { "completion_length": 203.69643859863282, "epoch": 0.22398506766215587, "grad_norm": 1.3643263578414917, "kl": 0.08203125, "learning_rate": 9.541056902814896e-07, "loss": 0.0227, "reward": 1.285714340209961, "reward_std": 0.27460705786943435, "rewards/accuracy_reward": 0.37857144623994826, "rewards/format_reward": 0.9071428894996643, "step": 480 }, { "completion_length": 220.28929595947267, "epoch": 0.22865142323845078, "grad_norm": 1.668428897857666, "kl": 0.0689453125, "learning_rate": 9.506358229784194e-07, "loss": 0.0146, "reward": 1.3071429133415222, "reward_std": 0.17622366920113564, "rewards/accuracy_reward": 0.3571428686380386, "rewards/format_reward": 0.9500000238418579, "step": 490 }, { "completion_length": 216.17501220703124, "epoch": 0.2333177788147457, "grad_norm": 2.019341468811035, "kl": 0.079345703125, "learning_rate": 9.4704630868196e-07, "loss": 0.0646, "reward": 1.1821429014205933, "reward_std": 0.23459327667951585, "rewards/accuracy_reward": 0.2535714410245419, "rewards/format_reward": 0.9285714566707611, "step": 500 }, { "completion_length": 205.07501068115235, "epoch": 0.2379841343910406, "grad_norm": 1.7414186000823975, "kl": 0.0832763671875, "learning_rate": 9.433381004335061e-07, "loss": 0.0468, "reward": 1.2071429073810578, "reward_std": 0.2705150328576565, "rewards/accuracy_reward": 0.30000001192092896, "rewards/format_reward": 0.9071428894996643, "step": 510 }, { "completion_length": 180.91429595947267, "epoch": 0.24265048996733551, "grad_norm": 2.2701659202575684, "kl": 0.124853515625, "learning_rate": 9.395121827885355e-07, "loss": 0.0327, "reward": 1.3142857670783996, "reward_std": 0.21858522519469262, "rewards/accuracy_reward": 0.37142859064042566, "rewards/format_reward": 0.9428571701049805, "step": 520 }, { "completion_length": 192.1821517944336, "epoch": 0.24731684554363043, "grad_norm": 14.688100814819336, "kl": 0.170263671875, "learning_rate": 9.355695715552011e-07, "loss": 0.0272, "reward": 1.2142857670783997, "reward_std": 0.20738017484545707, "rewards/accuracy_reward": 0.27142858095467093, "rewards/format_reward": 0.9428571701049805, "step": 530 }, { "completion_length": 202.57500915527345, "epoch": 0.25198320111992534, "grad_norm": 2.822713613510132, "kl": 0.1373046875, "learning_rate": 9.315113135246283e-07, "loss": 0.0432, "reward": 1.1928571820259095, "reward_std": 0.35453804582357407, "rewards/accuracy_reward": 0.28571429997682574, "rewards/format_reward": 0.9071429014205933, "step": 540 }, { "completion_length": 202.9321517944336, "epoch": 0.25664955669622025, "grad_norm": 1.5090000629425049, "kl": 0.12421875, "learning_rate": 9.273384861929836e-07, "loss": 0.0491, "reward": 1.1178572058677674, "reward_std": 0.23188644349575044, "rewards/accuracy_reward": 0.19642858132719992, "rewards/format_reward": 0.9214286029338836, "step": 550 }, { "completion_length": 192.82857666015624, "epoch": 0.26131591227251516, "grad_norm": 0.5006041526794434, "kl": 0.205029296875, "learning_rate": 9.230521974753919e-07, "loss": 0.0594, "reward": 1.2321429252624512, "reward_std": 0.2444589801132679, "rewards/accuracy_reward": 0.3321428716182709, "rewards/format_reward": 0.9000000298023224, "step": 560 }, { "completion_length": 171.6428649902344, "epoch": 0.26598226784881007, "grad_norm": 1.1855828762054443, "kl": 0.193359375, "learning_rate": 9.186535854117776e-07, "loss": 0.037, "reward": 1.260714328289032, "reward_std": 0.2411015473306179, "rewards/accuracy_reward": 0.33928573578596116, "rewards/format_reward": 0.9214286029338836, "step": 570 }, { "completion_length": 180.17143707275392, "epoch": 0.270648623425105, "grad_norm": 1.677296757698059, "kl": 0.1857421875, "learning_rate": 9.141438178647065e-07, "loss": 0.0374, "reward": 1.3000000596046448, "reward_std": 0.2666125223040581, "rewards/accuracy_reward": 0.3714285910129547, "rewards/format_reward": 0.9285714626312256, "step": 580 }, { "completion_length": 186.6821533203125, "epoch": 0.2753149790013999, "grad_norm": 5.2433247566223145, "kl": 0.16669921875, "learning_rate": 9.095240922093104e-07, "loss": 0.0407, "reward": 1.3500000715255738, "reward_std": 0.18397593572735788, "rewards/accuracy_reward": 0.39285715818405154, "rewards/format_reward": 0.9571428656578064, "step": 590 }, { "completion_length": 180.6428680419922, "epoch": 0.2799813345776948, "grad_norm": 0.4842807650566101, "kl": 0.190380859375, "learning_rate": 9.047956350153752e-07, "loss": 0.0147, "reward": 1.246428620815277, "reward_std": 0.21586237102746964, "rewards/accuracy_reward": 0.28928572684526443, "rewards/format_reward": 0.9571428775787354, "step": 600 }, { "completion_length": 191.20358123779297, "epoch": 0.2846476901539897, "grad_norm": 2.350338935852051, "kl": 0.14912109375, "learning_rate": 8.999597017216782e-07, "loss": 0.0334, "reward": 1.3035714983940125, "reward_std": 0.16870573312044143, "rewards/accuracy_reward": 0.33928572833538057, "rewards/format_reward": 0.9642857313156128, "step": 610 }, { "completion_length": 193.98215026855468, "epoch": 0.28931404573028463, "grad_norm": 2.8800251483917236, "kl": 0.21923828125, "learning_rate": 8.950175763026604e-07, "loss": 0.0245, "reward": 1.2071429133415221, "reward_std": 0.2516971692442894, "rewards/accuracy_reward": 0.2642857298254967, "rewards/format_reward": 0.942857164144516, "step": 620 }, { "completion_length": 191.61429290771486, "epoch": 0.29398040130657954, "grad_norm": 2.0671656131744385, "kl": 0.1837890625, "learning_rate": 8.899705709275217e-07, "loss": 0.0145, "reward": 1.385714340209961, "reward_std": 0.20700510069727898, "rewards/accuracy_reward": 0.4428571671247482, "rewards/format_reward": 0.9428571701049805, "step": 630 }, { "completion_length": 199.62858123779296, "epoch": 0.29864675688287445, "grad_norm": 5.944628715515137, "kl": 0.29609375, "learning_rate": 8.848200256118312e-07, "loss": 0.0386, "reward": 1.246428644657135, "reward_std": 0.2530567437410355, "rewards/accuracy_reward": 0.3035714406520128, "rewards/format_reward": 0.942857164144516, "step": 640 }, { "completion_length": 227.48572540283203, "epoch": 0.30331311245916937, "grad_norm": 8.969457626342773, "kl": 0.387255859375, "learning_rate": 8.795673078617432e-07, "loss": 0.0707, "reward": 1.2464286088943481, "reward_std": 0.2629224382340908, "rewards/accuracy_reward": 0.3250000149011612, "rewards/format_reward": 0.9214286029338836, "step": 650 }, { "completion_length": 210.75715026855468, "epoch": 0.3079794680354643, "grad_norm": 4.416165828704834, "kl": 0.473828125, "learning_rate": 8.74213812310915e-07, "loss": 0.0801, "reward": 1.221428632736206, "reward_std": 0.27719090431928634, "rewards/accuracy_reward": 0.29285715967416764, "rewards/format_reward": 0.9285714626312256, "step": 660 }, { "completion_length": 200.8928649902344, "epoch": 0.31264582361175924, "grad_norm": 7.957707405090332, "kl": 0.52421875, "learning_rate": 8.68760960350222e-07, "loss": 0.0485, "reward": 1.221428644657135, "reward_std": 0.31562927216291425, "rewards/accuracy_reward": 0.3071428701281548, "rewards/format_reward": 0.9142857432365418, "step": 670 }, { "completion_length": 194.02857971191406, "epoch": 0.31731217918805416, "grad_norm": 2.773921012878418, "kl": 0.4138671875, "learning_rate": 8.632101997503674e-07, "loss": 0.0431, "reward": 1.246428620815277, "reward_std": 0.2256075546145439, "rewards/accuracy_reward": 0.3178571492433548, "rewards/format_reward": 0.9285714626312256, "step": 680 }, { "completion_length": 183.4214324951172, "epoch": 0.32197853476434907, "grad_norm": 3.124154567718506, "kl": 0.189404296875, "learning_rate": 8.575630042774902e-07, "loss": 0.0263, "reward": 1.260714340209961, "reward_std": 0.1950671538710594, "rewards/accuracy_reward": 0.3107142999768257, "rewards/format_reward": 0.950000011920929, "step": 690 }, { "completion_length": 214.20000915527345, "epoch": 0.326644890340644, "grad_norm": 5.170936107635498, "kl": 0.279833984375, "learning_rate": 8.518208733018689e-07, "loss": 0.0798, "reward": 1.2071429252624513, "reward_std": 0.36203873455524443, "rewards/accuracy_reward": 0.30714287534356116, "rewards/format_reward": 0.9000000417232513, "step": 700 }, { "completion_length": 236.79644165039062, "epoch": 0.3313112459169389, "grad_norm": 4.168279647827148, "kl": 0.5509765625, "learning_rate": 8.459853313998283e-07, "loss": 0.1131, "reward": 1.2428571939468385, "reward_std": 0.33234085887670517, "rewards/accuracy_reward": 0.35714287459850313, "rewards/format_reward": 0.8857143223285675, "step": 710 }, { "completion_length": 212.58929443359375, "epoch": 0.3359776014932338, "grad_norm": 5.4327569007873535, "kl": 0.4990234375, "learning_rate": 8.400579279489541e-07, "loss": 0.095, "reward": 1.2250000715255738, "reward_std": 0.31540548279881475, "rewards/accuracy_reward": 0.2964285884052515, "rewards/format_reward": 0.9285714626312256, "step": 720 }, { "completion_length": 219.4178665161133, "epoch": 0.3406439570695287, "grad_norm": 6.700848579406738, "kl": 0.37919921875, "learning_rate": 8.340402367167216e-07, "loss": 0.0824, "reward": 1.1571429014205932, "reward_std": 0.30023063272237777, "rewards/accuracy_reward": 0.25000000819563867, "rewards/format_reward": 0.9071428835391998, "step": 730 }, { "completion_length": 193.31786346435547, "epoch": 0.3453103126458236, "grad_norm": 4.084702014923096, "kl": 0.28115234375, "learning_rate": 8.2793385544265e-07, "loss": 0.0288, "reward": 1.2535714745521545, "reward_std": 0.3358024753630161, "rewards/accuracy_reward": 0.339285734295845, "rewards/format_reward": 0.9142857432365418, "step": 740 }, { "completion_length": 208.07858123779297, "epoch": 0.34997666822211854, "grad_norm": 5.686543941497803, "kl": 0.33671875, "learning_rate": 8.217404054140909e-07, "loss": 0.0335, "reward": 1.1714286088943482, "reward_std": 0.2581101007759571, "rewards/accuracy_reward": 0.23571430072188376, "rewards/format_reward": 0.9357143104076385, "step": 750 }, { "completion_length": 205.19286651611327, "epoch": 0.35464302379841345, "grad_norm": 2.9921562671661377, "kl": 0.177294921875, "learning_rate": 8.154615310357649e-07, "loss": 0.0755, "reward": 1.2428571939468385, "reward_std": 0.27265038043260575, "rewards/accuracy_reward": 0.30714286789298056, "rewards/format_reward": 0.935714316368103, "step": 760 }, { "completion_length": 211.06072692871095, "epoch": 0.35930937937470836, "grad_norm": 2.7796568870544434, "kl": 0.48134765625, "learning_rate": 8.090988993931609e-07, "loss": 0.0967, "reward": 1.3071429133415222, "reward_std": 0.27672048956155776, "rewards/accuracy_reward": 0.39285716861486436, "rewards/format_reward": 0.9142857491970062, "step": 770 }, { "completion_length": 210.21429595947265, "epoch": 0.36397573495100327, "grad_norm": 4.467612266540527, "kl": 0.39296875, "learning_rate": 8.026541998099126e-07, "loss": 0.1026, "reward": 1.1821429014205933, "reward_std": 0.20331501960754395, "rewards/accuracy_reward": 0.24642858393490313, "rewards/format_reward": 0.9357143044471741, "step": 780 }, { "completion_length": 184.62858123779296, "epoch": 0.3686420905272982, "grad_norm": 2.8182897567749023, "kl": 0.4158203125, "learning_rate": 7.961291433992723e-07, "loss": 0.0864, "reward": 1.2571429133415222, "reward_std": 0.3440789520740509, "rewards/accuracy_reward": 0.35714287906885145, "rewards/format_reward": 0.9000000417232513, "step": 790 }, { "completion_length": 177.10000915527343, "epoch": 0.3733084461035931, "grad_norm": 5.841248035430908, "kl": 0.25205078125, "learning_rate": 7.895254626097964e-07, "loss": 0.0477, "reward": 1.3178572177886962, "reward_std": 0.26967298090457914, "rewards/accuracy_reward": 0.3750000178813934, "rewards/format_reward": 0.9428571701049805, "step": 800 }, { "completion_length": 186.17857971191407, "epoch": 0.377974801679888, "grad_norm": 2.7772974967956543, "kl": 0.372216796875, "learning_rate": 7.828449107653703e-07, "loss": 0.0548, "reward": 1.2035714864730835, "reward_std": 0.21819290220737458, "rewards/accuracy_reward": 0.26071429774165156, "rewards/format_reward": 0.9428571701049805, "step": 810 }, { "completion_length": 179.5714370727539, "epoch": 0.3826411572561829, "grad_norm": 3.573528528213501, "kl": 0.5103515625, "learning_rate": 7.760892615996862e-07, "loss": 0.0807, "reward": 1.296428620815277, "reward_std": 0.20319449976086618, "rewards/accuracy_reward": 0.3392857268452644, "rewards/format_reward": 0.9571428775787354, "step": 820 }, { "completion_length": 201.9464370727539, "epoch": 0.38730751283247783, "grad_norm": 2.7865989208221436, "kl": 0.32451171875, "learning_rate": 7.692603087853061e-07, "loss": 0.129, "reward": 1.1964286327362061, "reward_std": 0.2745025597512722, "rewards/accuracy_reward": 0.28214287012815475, "rewards/format_reward": 0.9142857372760773, "step": 830 }, { "completion_length": 187.04286499023436, "epoch": 0.39197386840877274, "grad_norm": 5.086669445037842, "kl": 0.412451171875, "learning_rate": 7.623598654574282e-07, "loss": 0.0784, "reward": 1.2285714864730835, "reward_std": 0.19613576233386992, "rewards/accuracy_reward": 0.2785714466124773, "rewards/format_reward": 0.9500000238418579, "step": 840 }, { "completion_length": 205.9321517944336, "epoch": 0.39664022398506765, "grad_norm": 6.3026580810546875, "kl": 0.42822265625, "learning_rate": 7.553897637324871e-07, "loss": 0.1118, "reward": 1.23571435213089, "reward_std": 0.264027439057827, "rewards/accuracy_reward": 0.32142858356237414, "rewards/format_reward": 0.9142857491970062, "step": 850 }, { "completion_length": 211.72857971191405, "epoch": 0.40130657956136256, "grad_norm": 7.240902423858643, "kl": 0.609912109375, "learning_rate": 7.483518542217136e-07, "loss": 0.1452, "reward": 1.2392857789993286, "reward_std": 0.2891633503139019, "rewards/accuracy_reward": 0.3250000163912773, "rewards/format_reward": 0.9142857491970062, "step": 860 }, { "completion_length": 196.15000915527344, "epoch": 0.4059729351376575, "grad_norm": 7.294248104095459, "kl": 0.38701171875, "learning_rate": 7.412480055397843e-07, "loss": 0.0556, "reward": 1.2500000596046448, "reward_std": 0.2849683463573456, "rewards/accuracy_reward": 0.3285714417695999, "rewards/format_reward": 0.9214285969734192, "step": 870 }, { "completion_length": 202.96786651611328, "epoch": 0.4106392907139524, "grad_norm": 2.0189414024353027, "kl": 0.35, "learning_rate": 7.340801038086918e-07, "loss": 0.0262, "reward": 1.2250000476837157, "reward_std": 0.19948717057704926, "rewards/accuracy_reward": 0.2750000137835741, "rewards/format_reward": 0.9500000238418579, "step": 880 }, { "completion_length": 190.95001068115235, "epoch": 0.4153056462902473, "grad_norm": 11.575712203979492, "kl": 0.3896484375, "learning_rate": 7.268500521569655e-07, "loss": 0.0922, "reward": 1.2142857670783997, "reward_std": 0.26723918691277504, "rewards/accuracy_reward": 0.3071428701281548, "rewards/format_reward": 0.9071428835391998, "step": 890 }, { "completion_length": 181.60000762939453, "epoch": 0.4199720018665422, "grad_norm": 8.530049324035645, "kl": 0.45927734375, "learning_rate": 7.195597702143772e-07, "loss": 0.0985, "reward": 1.1571429133415223, "reward_std": 0.27336115539073946, "rewards/accuracy_reward": 0.22857143953442574, "rewards/format_reward": 0.9285714626312256, "step": 900 }, { "completion_length": 190.7464385986328, "epoch": 0.4246383574428371, "grad_norm": 5.701374530792236, "kl": 1.1736328125, "learning_rate": 7.122111936022668e-07, "loss": 0.1988, "reward": 1.2500000596046448, "reward_std": 0.24506716057658195, "rewards/accuracy_reward": 0.32857144474983213, "rewards/format_reward": 0.9214286029338836, "step": 910 }, { "completion_length": 172.73214950561524, "epoch": 0.42930471301913203, "grad_norm": 4.281832695007324, "kl": 1.024951171875, "learning_rate": 7.048062734196204e-07, "loss": 0.1912, "reward": 1.2642857909202576, "reward_std": 0.343449330329895, "rewards/accuracy_reward": 0.35000001043081286, "rewards/format_reward": 0.9142857432365418, "step": 920 }, { "completion_length": 197.4178665161133, "epoch": 0.43397106859542695, "grad_norm": 4.720562934875488, "kl": 0.543603515625, "learning_rate": 6.9734697572504e-07, "loss": 0.0907, "reward": 1.2571429133415222, "reward_std": 0.2518312208354473, "rewards/accuracy_reward": 0.3285714462399483, "rewards/format_reward": 0.9285714566707611, "step": 930 }, { "completion_length": 196.92858123779297, "epoch": 0.4386374241717219, "grad_norm": 77.73806762695312, "kl": 1.05439453125, "learning_rate": 6.89835281014741e-07, "loss": 0.1745, "reward": 1.285714328289032, "reward_std": 0.28164542019367217, "rewards/accuracy_reward": 0.3642857253551483, "rewards/format_reward": 0.9214285969734192, "step": 940 }, { "completion_length": 179.39286575317382, "epoch": 0.4433037797480168, "grad_norm": 2.6931166648864746, "kl": 0.42705078125, "learning_rate": 6.822731836967168e-07, "loss": 0.0645, "reward": 1.3428572177886964, "reward_std": 0.19739395827054979, "rewards/accuracy_reward": 0.392857152223587, "rewards/format_reward": 0.9500000238418579, "step": 950 }, { "completion_length": 187.36429443359376, "epoch": 0.44797013532431174, "grad_norm": 5.5171427726745605, "kl": 0.443359375, "learning_rate": 6.746626915612085e-07, "loss": 0.0781, "reward": 1.2857143521308898, "reward_std": 0.19799869433045386, "rewards/accuracy_reward": 0.357142873108387, "rewards/format_reward": 0.9285714566707611, "step": 960 }, { "completion_length": 184.37500915527343, "epoch": 0.45263649090060665, "grad_norm": 13.36310863494873, "kl": 1.1681640625, "learning_rate": 6.670058252476235e-07, "loss": 0.2008, "reward": 1.3428571820259094, "reward_std": 0.262357784062624, "rewards/accuracy_reward": 0.41428573429584503, "rewards/format_reward": 0.9285714626312256, "step": 970 }, { "completion_length": 195.95358123779297, "epoch": 0.45730284647690156, "grad_norm": 2.2677857875823975, "kl": 1.1369140625, "learning_rate": 6.593046177080408e-07, "loss": 0.1455, "reward": 1.1714286088943482, "reward_std": 0.3041789963841438, "rewards/accuracy_reward": 0.2714285865426064, "rewards/format_reward": 0.9000000298023224, "step": 980 }, { "completion_length": 193.67857818603517, "epoch": 0.46196920205319647, "grad_norm": 1.030552625656128, "kl": 0.56953125, "learning_rate": 6.515611136674479e-07, "loss": 0.0992, "reward": 1.2642857789993287, "reward_std": 0.1844715215265751, "rewards/accuracy_reward": 0.32142859101295473, "rewards/format_reward": 0.9428571701049805, "step": 990 }, { "completion_length": 191.07143630981446, "epoch": 0.4666355576294914, "grad_norm": 7.8564453125, "kl": 0.504833984375, "learning_rate": 6.437773690808524e-07, "loss": 0.099, "reward": 1.3071429133415222, "reward_std": 0.24715664908289908, "rewards/accuracy_reward": 0.3714285910129547, "rewards/format_reward": 0.935714316368103, "step": 1000 }, { "completion_length": 201.9714370727539, "epoch": 0.4713019132057863, "grad_norm": 2.8117690086364746, "kl": 1.044677734375, "learning_rate": 6.359554505874109e-07, "loss": 0.2054, "reward": 1.196428608894348, "reward_std": 0.3307777248322964, "rewards/accuracy_reward": 0.2821428716182709, "rewards/format_reward": 0.9142857551574707, "step": 1010 }, { "completion_length": 189.82500915527345, "epoch": 0.4759682687820812, "grad_norm": 11.323598861694336, "kl": 0.63037109375, "learning_rate": 6.280974349617214e-07, "loss": 0.095, "reward": 1.2785714745521546, "reward_std": 0.26157640293240547, "rewards/accuracy_reward": 0.35000001788139345, "rewards/format_reward": 0.9285714566707611, "step": 1020 }, { "completion_length": 190.90001068115234, "epoch": 0.4806346243583761, "grad_norm": 14.018318176269531, "kl": 0.6455078125, "learning_rate": 6.202054085624261e-07, "loss": 0.1192, "reward": 1.2857143521308898, "reward_std": 0.23778653591871263, "rewards/accuracy_reward": 0.35714287161827085, "rewards/format_reward": 0.9285714566707611, "step": 1030 }, { "completion_length": 189.96786499023438, "epoch": 0.48530097993467103, "grad_norm": 5.3534932136535645, "kl": 0.632958984375, "learning_rate": 6.122814667782673e-07, "loss": 0.0864, "reward": 1.2285714626312256, "reward_std": 0.1533150166273117, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.9785714387893677, "step": 1040 }, { "completion_length": 195.0642936706543, "epoch": 0.48996733551096594, "grad_norm": 6.974513530731201, "kl": 0.494384765625, "learning_rate": 6.043277134717475e-07, "loss": 0.0765, "reward": 1.3321428894996643, "reward_std": 0.1950671575963497, "rewards/accuracy_reward": 0.37500001639127734, "rewards/format_reward": 0.9571428775787354, "step": 1050 }, { "completion_length": 204.17501068115234, "epoch": 0.49463369108726085, "grad_norm": 6.67943000793457, "kl": 0.497265625, "learning_rate": 5.963462604205392e-07, "loss": 0.0889, "reward": 1.260714340209961, "reward_std": 0.23447152674198152, "rewards/accuracy_reward": 0.3250000193715096, "rewards/format_reward": 0.935714316368103, "step": 1060 }, { "completion_length": 217.91072692871094, "epoch": 0.49930004666355576, "grad_norm": 2.6225903034210205, "kl": 1.211474609375, "learning_rate": 5.883392267567924e-07, "loss": 0.1539, "reward": 1.2142857789993287, "reward_std": 0.2801480941474438, "rewards/accuracy_reward": 0.28571429699659345, "rewards/format_reward": 0.9285714566707611, "step": 1070 }, { "completion_length": 200.80000762939454, "epoch": 0.5039664022398507, "grad_norm": 17.208133697509766, "kl": 1.3998046875, "learning_rate": 5.803087384044902e-07, "loss": 0.2627, "reward": 1.2071429014205932, "reward_std": 0.3747034803032875, "rewards/accuracy_reward": 0.3214285850524902, "rewards/format_reward": 0.885714328289032, "step": 1080 }, { "completion_length": 184.55357818603517, "epoch": 0.5086327578161456, "grad_norm": 4.367872714996338, "kl": 0.80498046875, "learning_rate": 5.722569275150019e-07, "loss": 0.1581, "reward": 1.2642857551574707, "reward_std": 0.2569018341600895, "rewards/accuracy_reward": 0.3285714417695999, "rewards/format_reward": 0.9357143104076385, "step": 1090 }, { "completion_length": 179.18572387695312, "epoch": 0.5132991133924405, "grad_norm": 4.963561058044434, "kl": 0.592578125, "learning_rate": 5.641859319009801e-07, "loss": 0.0957, "reward": 1.3250000715255736, "reward_std": 0.2563889928162098, "rewards/accuracy_reward": 0.3892857372760773, "rewards/format_reward": 0.9357143104076385, "step": 1100 }, { "completion_length": 180.88929290771483, "epoch": 0.5179654689687354, "grad_norm": 1.608384609222412, "kl": 0.348974609375, "learning_rate": 5.560978944687576e-07, "loss": 0.0775, "reward": 1.2714286208152772, "reward_std": 0.19492939710617066, "rewards/accuracy_reward": 0.32142858654260636, "rewards/format_reward": 0.9500000238418579, "step": 1110 }, { "completion_length": 164.9321502685547, "epoch": 0.5226318245450303, "grad_norm": 6.095414638519287, "kl": 0.59853515625, "learning_rate": 5.479949626493908e-07, "loss": 0.0792, "reward": 1.3428571939468383, "reward_std": 0.18014808967709542, "rewards/accuracy_reward": 0.37142859399318695, "rewards/format_reward": 0.9714285850524902, "step": 1120 }, { "completion_length": 179.71072387695312, "epoch": 0.5272981801213252, "grad_norm": 13.79627513885498, "kl": 0.299853515625, "learning_rate": 5.398792878285002e-07, "loss": 0.0579, "reward": 1.296428632736206, "reward_std": 0.16341925486922265, "rewards/accuracy_reward": 0.32500001788139343, "rewards/format_reward": 0.9714285850524902, "step": 1130 }, { "completion_length": 189.71429595947265, "epoch": 0.5319645356976201, "grad_norm": 7.43311071395874, "kl": 0.44296875, "learning_rate": 5.317530247750639e-07, "loss": 0.0818, "reward": 1.2785714983940124, "reward_std": 0.1776016980409622, "rewards/accuracy_reward": 0.32142858393490314, "rewards/format_reward": 0.9571428716182708, "step": 1140 }, { "completion_length": 207.7714416503906, "epoch": 0.5366308912739151, "grad_norm": 27.736406326293945, "kl": 1.2056640625, "learning_rate": 5.2361833106931e-07, "loss": 0.2633, "reward": 1.228571480512619, "reward_std": 0.36894305497407914, "rewards/accuracy_reward": 0.3571428719907999, "rewards/format_reward": 0.8714286148548126, "step": 1150 }, { "completion_length": 216.8928695678711, "epoch": 0.54129724685021, "grad_norm": 8.771681785583496, "kl": 0.88974609375, "learning_rate": 5.154773665298648e-07, "loss": 0.1611, "reward": 1.1535714745521546, "reward_std": 0.2724130667746067, "rewards/accuracy_reward": 0.2392857253551483, "rewards/format_reward": 0.9142857491970062, "step": 1160 }, { "completion_length": 190.18572082519532, "epoch": 0.5459636024265049, "grad_norm": 6.968381404876709, "kl": 0.37958984375, "learning_rate": 5.073322926403045e-07, "loss": 0.0619, "reward": 1.260714340209961, "reward_std": 0.15576233565807343, "rewards/accuracy_reward": 0.275000012293458, "rewards/format_reward": 0.9857142925262451, "step": 1170 }, { "completion_length": 220.9714370727539, "epoch": 0.5506299580027998, "grad_norm": 1.6743552684783936, "kl": 0.696728515625, "learning_rate": 4.991852719752678e-07, "loss": 0.1253, "reward": 1.2321429014205934, "reward_std": 0.24198277071118354, "rewards/accuracy_reward": 0.31071430146694184, "rewards/format_reward": 0.9214285969734192, "step": 1180 }, { "completion_length": 203.98929595947266, "epoch": 0.5552963135790947, "grad_norm": 8.832259178161621, "kl": 1.82734375, "learning_rate": 4.910384676262752e-07, "loss": 0.1067, "reward": 1.26071435213089, "reward_std": 0.32460705041885374, "rewards/accuracy_reward": 0.36785716116428374, "rewards/format_reward": 0.8928571701049804, "step": 1190 }, { "completion_length": 188.37858123779296, "epoch": 0.5599626691553896, "grad_norm": 4.268427848815918, "kl": 0.414404296875, "learning_rate": 4.828940426274142e-07, "loss": 0.0621, "reward": 1.3285714864730835, "reward_std": 0.23999654203653337, "rewards/accuracy_reward": 0.3714285850524902, "rewards/format_reward": 0.9571428775787354, "step": 1200 }, { "completion_length": 207.16429290771484, "epoch": 0.5646290247316845, "grad_norm": 22.58124542236328, "kl": 0.891845703125, "learning_rate": 4.747541593810377e-07, "loss": 0.1984, "reward": 1.2178572058677672, "reward_std": 0.3189430497586727, "rewards/accuracy_reward": 0.310714303329587, "rewards/format_reward": 0.9071428954601288, "step": 1210 }, { "completion_length": 218.35000915527343, "epoch": 0.5692953803079794, "grad_norm": 7.771918773651123, "kl": 0.96904296875, "learning_rate": 4.666209790836316e-07, "loss": 0.1555, "reward": 1.2107143342494964, "reward_std": 0.3533112980425358, "rewards/accuracy_reward": 0.3178571552038193, "rewards/format_reward": 0.8928571820259095, "step": 1220 }, { "completion_length": 187.87857971191406, "epoch": 0.5739617358842743, "grad_norm": 13.805558204650879, "kl": 0.933251953125, "learning_rate": 4.5849666115200143e-07, "loss": 0.1366, "reward": 1.2500000715255737, "reward_std": 0.21033736318349838, "rewards/accuracy_reward": 0.30714286863803864, "rewards/format_reward": 0.942857164144516, "step": 1230 }, { "completion_length": 197.11786804199218, "epoch": 0.5786280914605693, "grad_norm": 3.612844228744507, "kl": 0.625244140625, "learning_rate": 4.503833626499317e-07, "loss": 0.1048, "reward": 1.1892857551574707, "reward_std": 0.3342569015920162, "rewards/accuracy_reward": 0.2821428693830967, "rewards/format_reward": 0.9071428894996643, "step": 1240 }, { "completion_length": 193.096435546875, "epoch": 0.5832944470368642, "grad_norm": 4.014871597290039, "kl": 0.572314453125, "learning_rate": 4.42283237715471e-07, "loss": 0.0812, "reward": 1.160714316368103, "reward_std": 0.28212499171495437, "rewards/accuracy_reward": 0.26071429550647734, "rewards/format_reward": 0.9000000357627869, "step": 1250 }, { "completion_length": 195.02500762939454, "epoch": 0.5879608026131591, "grad_norm": 4.279513359069824, "kl": 0.7986328125, "learning_rate": 4.3419843698899234e-07, "loss": 0.1005, "reward": 1.2928572058677674, "reward_std": 0.25148131176829336, "rewards/accuracy_reward": 0.3571428656578064, "rewards/format_reward": 0.9357143044471741, "step": 1260 }, { "completion_length": 221.07858123779297, "epoch": 0.592627158189454, "grad_norm": 4.270975589752197, "kl": 0.7974609375, "learning_rate": 4.2613110704218336e-07, "loss": 0.1913, "reward": 1.210714340209961, "reward_std": 0.27596538737416265, "rewards/accuracy_reward": 0.30357144251465795, "rewards/format_reward": 0.9071429014205933, "step": 1270 }, { "completion_length": 192.57857971191407, "epoch": 0.5972935137657489, "grad_norm": 6.560425281524658, "kl": 1.0640625, "learning_rate": 4.1808338980811666e-07, "loss": 0.1447, "reward": 1.2214286208152771, "reward_std": 0.2975998237729073, "rewards/accuracy_reward": 0.3285714462399483, "rewards/format_reward": 0.8928571760654449, "step": 1280 }, { "completion_length": 210.92501220703124, "epoch": 0.6019598693420438, "grad_norm": 2.6335413455963135, "kl": 1.1158203125, "learning_rate": 4.100574220125506e-07, "loss": 0.2254, "reward": 1.2178571939468383, "reward_std": 0.38201676979660987, "rewards/accuracy_reward": 0.3250000149011612, "rewards/format_reward": 0.892857164144516, "step": 1290 }, { "completion_length": 226.9035842895508, "epoch": 0.6066262249183387, "grad_norm": 6.515714645385742, "kl": 1.377734375, "learning_rate": 4.020553346066144e-07, "loss": 0.2749, "reward": 1.2035714745521546, "reward_std": 0.37217203676700594, "rewards/accuracy_reward": 0.3107142999768257, "rewards/format_reward": 0.8928571879863739, "step": 1300 }, { "completion_length": 211.1928680419922, "epoch": 0.6112925804946336, "grad_norm": 16.117403030395508, "kl": 1.2681640625, "learning_rate": 3.9407925220102493e-07, "loss": 0.2125, "reward": 1.1928571820259095, "reward_std": 0.3735316038131714, "rewards/accuracy_reward": 0.2928571544587612, "rewards/format_reward": 0.9000000357627869, "step": 1310 }, { "completion_length": 203.11429290771486, "epoch": 0.6159589360709286, "grad_norm": 7.737660884857178, "kl": 0.8955078125, "learning_rate": 3.86131292501988e-07, "loss": 0.126, "reward": 1.2571429014205933, "reward_std": 0.31292245015501974, "rewards/accuracy_reward": 0.3428571581840515, "rewards/format_reward": 0.9142857551574707, "step": 1320 }, { "completion_length": 188.27857971191406, "epoch": 0.6206252916472235, "grad_norm": 3.5433154106140137, "kl": 0.89716796875, "learning_rate": 3.7821356574893204e-07, "loss": 0.1548, "reward": 1.31071435213089, "reward_std": 0.26513244956731796, "rewards/accuracy_reward": 0.36785716116428374, "rewards/format_reward": 0.942857164144516, "step": 1330 }, { "completion_length": 196.4714385986328, "epoch": 0.6252916472235185, "grad_norm": 6.886636734008789, "kl": 0.853076171875, "learning_rate": 3.7032817415422517e-07, "loss": 0.1634, "reward": 1.2678572058677673, "reward_std": 0.2711702950298786, "rewards/accuracy_reward": 0.3321428790688515, "rewards/format_reward": 0.935714316368103, "step": 1340 }, { "completion_length": 197.9714385986328, "epoch": 0.6299580027998134, "grad_norm": 10.59721565246582, "kl": 1.061083984375, "learning_rate": 3.624772113450223e-07, "loss": 0.1761, "reward": 1.2678572058677673, "reward_std": 0.32303600385785103, "rewards/accuracy_reward": 0.36071430891752243, "rewards/format_reward": 0.9071428954601288, "step": 1350 }, { "completion_length": 180.4321502685547, "epoch": 0.6346243583761083, "grad_norm": 2.4233856201171875, "kl": 0.690625, "learning_rate": 3.5466276180739264e-07, "loss": 0.0947, "reward": 1.2892857670783997, "reward_std": 0.21290518939495087, "rewards/accuracy_reward": 0.3392857272177935, "rewards/format_reward": 0.9500000238418579, "step": 1360 }, { "completion_length": 196.02857971191406, "epoch": 0.6392907139524032, "grad_norm": 18.396207809448242, "kl": 0.962060546875, "learning_rate": 3.4688690033287414e-07, "loss": 0.155, "reward": 1.3535714745521545, "reward_std": 0.24271938800811768, "rewards/accuracy_reward": 0.4250000178813934, "rewards/format_reward": 0.9285714626312256, "step": 1370 }, { "completion_length": 194.17501068115234, "epoch": 0.6439570695286981, "grad_norm": 12.984419822692871, "kl": 0.37880859375, "learning_rate": 3.3915169146760137e-07, "loss": 0.096, "reward": 1.2642857909202576, "reward_std": 0.2268330782651901, "rewards/accuracy_reward": 0.33571430034935473, "rewards/format_reward": 0.9285714507102967, "step": 1380 }, { "completion_length": 180.59286499023438, "epoch": 0.648623425104993, "grad_norm": 3.471736192703247, "kl": 0.78525390625, "learning_rate": 3.3145918896415394e-07, "loss": 0.0905, "reward": 1.3535714745521545, "reward_std": 0.1773286685347557, "rewards/accuracy_reward": 0.417857164144516, "rewards/format_reward": 0.935714316368103, "step": 1390 }, { "completion_length": 172.69286499023437, "epoch": 0.653289780681288, "grad_norm": 5.568473815917969, "kl": 0.4484130859375, "learning_rate": 3.2381143523627106e-07, "loss": 0.0142, "reward": 1.3071429252624511, "reward_std": 0.19887898862361908, "rewards/accuracy_reward": 0.3500000134110451, "rewards/format_reward": 0.9571428775787354, "step": 1400 }, { "completion_length": 197.35714874267578, "epoch": 0.6579561362575829, "grad_norm": 5.128924369812012, "kl": 0.848291015625, "learning_rate": 3.16210460816576e-07, "loss": 0.1411, "reward": 1.2464286088943481, "reward_std": 0.20331502109766006, "rewards/accuracy_reward": 0.3035714402794838, "rewards/format_reward": 0.9428571701049805, "step": 1410 }, { "completion_length": 205.62858123779296, "epoch": 0.6626224918338778, "grad_norm": 6.538782596588135, "kl": 0.71435546875, "learning_rate": 3.086582838174551e-07, "loss": 0.1207, "reward": 1.210714328289032, "reward_std": 0.26858522146940234, "rewards/accuracy_reward": 0.2750000096857548, "rewards/format_reward": 0.9357143104076385, "step": 1420 }, { "completion_length": 177.58214874267577, "epoch": 0.6672888474101727, "grad_norm": 2.0602848529815674, "kl": 0.540087890625, "learning_rate": 3.0115690939523514e-07, "loss": 0.0609, "reward": 1.2571429014205933, "reward_std": 0.19617216065526008, "rewards/accuracy_reward": 0.3071428701281548, "rewards/format_reward": 0.9500000238418579, "step": 1430 }, { "completion_length": 190.9821517944336, "epoch": 0.6719552029864676, "grad_norm": 1.0282678604125977, "kl": 0.74091796875, "learning_rate": 2.9370832921779983e-07, "loss": 0.1188, "reward": 1.2035714626312255, "reward_std": 0.22572807371616363, "rewards/accuracy_reward": 0.25357144251465796, "rewards/format_reward": 0.9500000238418579, "step": 1440 }, { "completion_length": 197.7464385986328, "epoch": 0.6766215585627625, "grad_norm": 5.613475799560547, "kl": 0.711767578125, "learning_rate": 2.8631452093578814e-07, "loss": 0.1211, "reward": 1.3035714864730834, "reward_std": 0.20981329679489136, "rewards/accuracy_reward": 0.3750000141561031, "rewards/format_reward": 0.9285714626312256, "step": 1450 }, { "completion_length": 200.86786651611328, "epoch": 0.6812879141390574, "grad_norm": 6.9082841873168945, "kl": 1.02734375, "learning_rate": 2.7897744765751375e-07, "loss": 0.1942, "reward": 1.321428620815277, "reward_std": 0.2701858140528202, "rewards/accuracy_reward": 0.4000000197440386, "rewards/format_reward": 0.9214286088943482, "step": 1460 }, { "completion_length": 191.17858123779297, "epoch": 0.6859542697153523, "grad_norm": 116.39089965820312, "kl": 0.575634765625, "learning_rate": 2.716990574277469e-07, "loss": 0.086, "reward": 1.2821429133415223, "reward_std": 0.21748021617531776, "rewards/accuracy_reward": 0.33214287310838697, "rewards/format_reward": 0.9500000178813934, "step": 1470 }, { "completion_length": 196.97500762939453, "epoch": 0.6906206252916472, "grad_norm": 5.2841033935546875, "kl": 0.8533203125, "learning_rate": 2.644812827104933e-07, "loss": 0.1501, "reward": 1.175000047683716, "reward_std": 0.30686734020709994, "rewards/accuracy_reward": 0.2821428686380386, "rewards/format_reward": 0.8928571820259095, "step": 1480 }, { "completion_length": 186.7571533203125, "epoch": 0.6952869808679422, "grad_norm": 4.615259170532227, "kl": 0.45205078125, "learning_rate": 2.573260398759125e-07, "loss": 0.0948, "reward": 1.346428632736206, "reward_std": 0.12943540289998054, "rewards/accuracy_reward": 0.36785715967416766, "rewards/format_reward": 0.9785714387893677, "step": 1490 }, { "completion_length": 175.87857818603516, "epoch": 0.6999533364442371, "grad_norm": 2.3132364749908447, "kl": 0.661865234375, "learning_rate": 2.5023522869150705e-07, "loss": 0.0561, "reward": 1.2535714864730836, "reward_std": 0.21969022005796432, "rewards/accuracy_reward": 0.31785715371370316, "rewards/format_reward": 0.935714316368103, "step": 1500 }, { "completion_length": 187.2821517944336, "epoch": 0.704619692020532, "grad_norm": 29.17850685119629, "kl": 0.95556640625, "learning_rate": 2.432107318177217e-07, "loss": 0.1785, "reward": 1.3500000596046449, "reward_std": 0.25344905629754066, "rewards/accuracy_reward": 0.42142859399318694, "rewards/format_reward": 0.9285714566707611, "step": 1510 }, { "completion_length": 177.58572235107422, "epoch": 0.7092860475968269, "grad_norm": 1.220908284187317, "kl": 0.591015625, "learning_rate": 2.3625441430808347e-07, "loss": 0.0738, "reward": 1.4071429133415223, "reward_std": 0.21487789303064347, "rewards/accuracy_reward": 0.4714285969734192, "rewards/format_reward": 0.9357143044471741, "step": 1520 }, { "completion_length": 180.8821533203125, "epoch": 0.7139524031731218, "grad_norm": 4.141109943389893, "kl": 0.378857421875, "learning_rate": 2.2936812311401682e-07, "loss": 0.0597, "reward": 1.3000000715255737, "reward_std": 0.17239581793546677, "rewards/accuracy_reward": 0.3357143022119999, "rewards/format_reward": 0.9642857313156128, "step": 1530 }, { "completion_length": 183.73214874267578, "epoch": 0.7186187587494167, "grad_norm": 10.828474044799805, "kl": 0.504296875, "learning_rate": 2.225536865944646e-07, "loss": 0.0564, "reward": 1.3321429133415221, "reward_std": 0.14654723256826402, "rewards/accuracy_reward": 0.3607143074274063, "rewards/format_reward": 0.9714285850524902, "step": 1540 }, { "completion_length": 189.9714370727539, "epoch": 0.7232851143257116, "grad_norm": 7.55503511428833, "kl": 0.721484375, "learning_rate": 2.1581291403044632e-07, "loss": 0.1054, "reward": 1.2250000596046449, "reward_std": 0.2633721731603146, "rewards/accuracy_reward": 0.2892857283353806, "rewards/format_reward": 0.9357143104076385, "step": 1550 }, { "completion_length": 180.25000762939453, "epoch": 0.7279514699020065, "grad_norm": 9.971400260925293, "kl": 0.6979736328125, "learning_rate": 2.0914759514468106e-07, "loss": 0.1232, "reward": 1.2785714745521546, "reward_std": 0.2545368172228336, "rewards/accuracy_reward": 0.357142873480916, "rewards/format_reward": 0.9214286029338836, "step": 1560 }, { "completion_length": 198.396435546875, "epoch": 0.7326178254783015, "grad_norm": 9.208026885986328, "kl": 1.296337890625, "learning_rate": 2.0255949962640333e-07, "loss": 0.2623, "reward": 1.2785715103149413, "reward_std": 0.27719091176986693, "rewards/accuracy_reward": 0.3785714417695999, "rewards/format_reward": 0.900000023841858, "step": 1570 }, { "completion_length": 204.6928680419922, "epoch": 0.7372841810545964, "grad_norm": 7.680899620056152, "kl": 0.668994140625, "learning_rate": 1.9605037666149832e-07, "loss": 0.1278, "reward": 1.2857143521308898, "reward_std": 0.24715665131807327, "rewards/accuracy_reward": 0.357142873108387, "rewards/format_reward": 0.9285714566707611, "step": 1580 }, { "completion_length": 185.85357818603515, "epoch": 0.7419505366308913, "grad_norm": 8.488438606262207, "kl": 0.361083984375, "learning_rate": 1.8962195446808083e-07, "loss": 0.0404, "reward": 1.196428644657135, "reward_std": 0.25750192254781723, "rewards/accuracy_reward": 0.26785715520381925, "rewards/format_reward": 0.9285714507102967, "step": 1590 }, { "completion_length": 196.72500762939453, "epoch": 0.7466168922071862, "grad_norm": 4.548067092895508, "kl": 1.42060546875, "learning_rate": 1.8327593983764057e-07, "loss": 0.2529, "reward": 1.335714328289032, "reward_std": 0.377190912514925, "rewards/accuracy_reward": 0.4285714507102966, "rewards/format_reward": 0.9071428894996643, "step": 1600 }, { "completion_length": 177.2678649902344, "epoch": 0.7512832477834811, "grad_norm": 3.3683552742004395, "kl": 0.58388671875, "learning_rate": 1.770140176818774e-07, "loss": 0.0739, "reward": 1.3428571939468383, "reward_std": 0.18543876633048056, "rewards/accuracy_reward": 0.3642857313156128, "rewards/format_reward": 0.9785714387893677, "step": 1610 }, { "completion_length": 203.56429595947264, "epoch": 0.755949603359776, "grad_norm": 3.158090114593506, "kl": 0.81240234375, "learning_rate": 1.7083785058534566e-07, "loss": 0.1285, "reward": 1.2821429371833801, "reward_std": 0.24443381130695344, "rewards/accuracy_reward": 0.3392857268452644, "rewards/format_reward": 0.9428571701049805, "step": 1620 }, { "completion_length": 203.41429443359374, "epoch": 0.7606159589360709, "grad_norm": 3.8291237354278564, "kl": 1.1529296875, "learning_rate": 1.6474907836402507e-07, "loss": 0.1792, "reward": 1.2678571939468384, "reward_std": 0.24378738924860954, "rewards/accuracy_reward": 0.3464285895228386, "rewards/format_reward": 0.9214285969734192, "step": 1630 }, { "completion_length": 204.32500915527345, "epoch": 0.7652823145123658, "grad_norm": 127.166259765625, "kl": 1.08779296875, "learning_rate": 1.5874931762993933e-07, "loss": 0.1349, "reward": 1.196428620815277, "reward_std": 0.26433941870927813, "rewards/accuracy_reward": 0.28928572684526443, "rewards/format_reward": 0.9071428775787354, "step": 1640 }, { "completion_length": 210.50001068115233, "epoch": 0.7699486700886607, "grad_norm": 1.736830472946167, "kl": 0.989453125, "learning_rate": 1.5284016136193396e-07, "loss": 0.2122, "reward": 1.2178571939468383, "reward_std": 0.2605919159948826, "rewards/accuracy_reward": 0.2964285835623741, "rewards/format_reward": 0.9214286088943482, "step": 1650 }, { "completion_length": 201.36072235107423, "epoch": 0.7746150256649557, "grad_norm": 10.542801856994629, "kl": 1.331201171875, "learning_rate": 1.4702317848272838e-07, "loss": 0.2161, "reward": 1.3214286327362061, "reward_std": 0.28298772796988486, "rewards/accuracy_reward": 0.40000002086162567, "rewards/format_reward": 0.9214286029338836, "step": 1660 }, { "completion_length": 192.8857208251953, "epoch": 0.7792813812412506, "grad_norm": 6.193233966827393, "kl": 1.0849609375, "learning_rate": 1.4129991344235653e-07, "loss": 0.1358, "reward": 1.2321429014205934, "reward_std": 0.2178552895784378, "rewards/accuracy_reward": 0.31785715706646445, "rewards/format_reward": 0.9142857432365418, "step": 1670 }, { "completion_length": 173.0357223510742, "epoch": 0.7839477368175455, "grad_norm": 16.36906623840332, "kl": 0.372314453125, "learning_rate": 1.3567188580810435e-07, "loss": 0.0753, "reward": 1.4285714864730834, "reward_std": 0.19271938651800155, "rewards/accuracy_reward": 0.46428574323654176, "rewards/format_reward": 0.9642857313156128, "step": 1680 }, { "completion_length": 194.05357971191407, "epoch": 0.7886140923938404, "grad_norm": 2.392770290374756, "kl": 0.406396484375, "learning_rate": 1.3014058986105374e-07, "loss": 0.0856, "reward": 1.2535714864730836, "reward_std": 0.15812735334038736, "rewards/accuracy_reward": 0.2892857283353806, "rewards/format_reward": 0.9642857313156128, "step": 1690 }, { "completion_length": 175.20000762939452, "epoch": 0.7932804479701353, "grad_norm": 4.713147163391113, "kl": 0.342333984375, "learning_rate": 1.2470749419934057e-07, "loss": 0.0522, "reward": 1.435714340209961, "reward_std": 0.1269535943865776, "rewards/accuracy_reward": 0.4428571715950966, "rewards/format_reward": 0.9928571462631226, "step": 1700 }, { "completion_length": 178.71429595947265, "epoch": 0.7979468035464302, "grad_norm": 2.6368408203125, "kl": 0.63681640625, "learning_rate": 1.1937404134823175e-07, "loss": 0.0749, "reward": 1.2642857551574707, "reward_std": 0.21649573594331742, "rewards/accuracy_reward": 0.3000000137835741, "rewards/format_reward": 0.9642857313156128, "step": 1710 }, { "completion_length": 180.82500915527345, "epoch": 0.8026131591227251, "grad_norm": 4.20245361328125, "kl": 0.404296875, "learning_rate": 1.1414164737712401e-07, "loss": 0.0445, "reward": 1.3035714864730834, "reward_std": 0.21377288773655892, "rewards/accuracy_reward": 0.35357144474983215, "rewards/format_reward": 0.9500000178813934, "step": 1720 }, { "completion_length": 193.33929290771485, "epoch": 0.80727951469902, "grad_norm": 12.832620620727539, "kl": 0.924365234375, "learning_rate": 1.0901170152356775e-07, "loss": 0.1151, "reward": 1.2142857551574706, "reward_std": 0.24824440032243728, "rewards/accuracy_reward": 0.3000000149011612, "rewards/format_reward": 0.9142857551574707, "step": 1730 }, { "completion_length": 195.20358123779297, "epoch": 0.811945870275315, "grad_norm": 8.172798156738281, "kl": 0.75888671875, "learning_rate": 1.0398556582441481e-07, "loss": 0.1337, "reward": 1.271428644657135, "reward_std": 0.271032539755106, "rewards/accuracy_reward": 0.3428571570664644, "rewards/format_reward": 0.9285714626312256, "step": 1740 }, { "completion_length": 183.7714385986328, "epoch": 0.8166122258516099, "grad_norm": 12.028782844543457, "kl": 1.259375, "learning_rate": 9.906457475418778e-08, "loss": 0.1913, "reward": 1.3107143759727478, "reward_std": 0.25871951803565024, "rewards/accuracy_reward": 0.38214287757873533, "rewards/format_reward": 0.9285714566707611, "step": 1750 }, { "completion_length": 195.21072387695312, "epoch": 0.8212785814279048, "grad_norm": 8.357789039611816, "kl": 0.935986328125, "learning_rate": 9.425003487076789e-08, "loss": 0.1143, "reward": 1.2392857670783997, "reward_std": 0.2283131591975689, "rewards/accuracy_reward": 0.30357144549489024, "rewards/format_reward": 0.9357143104076385, "step": 1760 }, { "completion_length": 186.48572387695313, "epoch": 0.8259449370041997, "grad_norm": 4.591169834136963, "kl": 0.8271484375, "learning_rate": 8.954322446849444e-08, "loss": 0.1123, "reward": 1.3535715103149415, "reward_std": 0.23052316084504126, "rewards/accuracy_reward": 0.40357144773006437, "rewards/format_reward": 0.9500000238418579, "step": 1770 }, { "completion_length": 203.71072082519532, "epoch": 0.8306112925804946, "grad_norm": 4.072372913360596, "kl": 1.01142578125, "learning_rate": 8.494539323876871e-08, "loss": 0.1496, "reward": 1.2678571939468384, "reward_std": 0.24972448274493217, "rewards/accuracy_reward": 0.34642858654260633, "rewards/format_reward": 0.9214285969734192, "step": 1780 }, { "completion_length": 173.66429290771484, "epoch": 0.8352776481567895, "grad_norm": 1.4727064371109009, "kl": 0.4848876953125, "learning_rate": 8.045776193825204e-08, "loss": 0.0449, "reward": 1.335714340209961, "reward_std": 0.20700509771704673, "rewards/accuracy_reward": 0.3857143074274063, "rewards/format_reward": 0.9500000238418579, "step": 1790 }, { "completion_length": 178.2857223510742, "epoch": 0.8399440037330844, "grad_norm": 2.1876659393310547, "kl": 0.60693359375, "learning_rate": 7.608152206474638e-08, "loss": 0.0354, "reward": 1.36071435213089, "reward_std": 0.2141479544341564, "rewards/accuracy_reward": 0.41071430742740633, "rewards/format_reward": 0.9500000238418579, "step": 1800 }, { "completion_length": 184.4964370727539, "epoch": 0.8446103593093793, "grad_norm": 1.164255976676941, "kl": 0.330322265625, "learning_rate": 7.181783554084308e-08, "loss": 0.0332, "reward": 1.3071429133415222, "reward_std": 0.13999654576182366, "rewards/accuracy_reward": 0.32142859026789666, "rewards/format_reward": 0.9857142925262451, "step": 1810 }, { "completion_length": 180.47501068115236, "epoch": 0.8492767148856742, "grad_norm": 17.824142456054688, "kl": 0.4853271484375, "learning_rate": 6.766783440542434e-08, "loss": 0.0599, "reward": 1.2642857670783996, "reward_std": 0.19838216677308082, "rewards/accuracy_reward": 0.32857144847512243, "rewards/format_reward": 0.935714316368103, "step": 1820 }, { "completion_length": 187.25000915527343, "epoch": 0.8539430704619692, "grad_norm": 7.02644681930542, "kl": 0.693896484375, "learning_rate": 6.363262051309908e-08, "loss": 0.1129, "reward": 1.2892857670783997, "reward_std": 0.27438203766942026, "rewards/accuracy_reward": 0.35357144474983215, "rewards/format_reward": 0.9357143104076385, "step": 1830 }, { "completion_length": 190.3964370727539, "epoch": 0.8586094260382641, "grad_norm": 0.5623534321784973, "kl": 0.6712890625, "learning_rate": 5.971326524165226e-08, "loss": 0.1025, "reward": 1.296428644657135, "reward_std": 0.2583474151790142, "rewards/accuracy_reward": 0.36785715967416766, "rewards/format_reward": 0.9285714566707611, "step": 1840 }, { "completion_length": 191.0107223510742, "epoch": 0.863275781614559, "grad_norm": 0.3061552047729492, "kl": 0.79541015625, "learning_rate": 5.591080920758695e-08, "loss": 0.1553, "reward": 1.2821429252624512, "reward_std": 0.2908777602016926, "rewards/accuracy_reward": 0.3678571581840515, "rewards/format_reward": 0.9142857491970062, "step": 1850 }, { "completion_length": 190.6214385986328, "epoch": 0.8679421371908539, "grad_norm": 4.034696578979492, "kl": 0.664599609375, "learning_rate": 5.22262619898331e-08, "loss": 0.1263, "reward": 1.2821429133415223, "reward_std": 0.24259887337684632, "rewards/accuracy_reward": 0.3464285880327225, "rewards/format_reward": 0.935714316368103, "step": 1860 }, { "completion_length": 192.10358276367188, "epoch": 0.8726084927671488, "grad_norm": 11.7506742477417, "kl": 1.0009765625, "learning_rate": 4.8660601861697294e-08, "loss": 0.1442, "reward": 1.3321429133415221, "reward_std": 0.2509672470390797, "rewards/accuracy_reward": 0.39642858803272246, "rewards/format_reward": 0.935714316368103, "step": 1870 }, { "completion_length": 194.44286346435547, "epoch": 0.8772748483434438, "grad_norm": 2.883983850479126, "kl": 0.7970703125, "learning_rate": 4.5214775531124184e-08, "loss": 0.0791, "reward": 1.2214286148548126, "reward_std": 0.2235020525753498, "rewards/accuracy_reward": 0.2785714402794838, "rewards/format_reward": 0.9428571581840515, "step": 1880 }, { "completion_length": 177.9107223510742, "epoch": 0.8819412039197387, "grad_norm": 12.7833251953125, "kl": 0.6814453125, "learning_rate": 4.188969788933899e-08, "loss": 0.0794, "reward": 1.2928571939468383, "reward_std": 0.2521940000355244, "rewards/accuracy_reward": 0.36428572833538053, "rewards/format_reward": 0.9285714507102967, "step": 1890 }, { "completion_length": 181.83929595947265, "epoch": 0.8866075594960336, "grad_norm": 1.639757513999939, "kl": 1.08232421875, "learning_rate": 3.8686251767937325e-08, "loss": 0.1071, "reward": 1.3000000596046448, "reward_std": 0.22904308661818504, "rewards/accuracy_reward": 0.35714287161827085, "rewards/format_reward": 0.942857164144516, "step": 1900 }, { "completion_length": 192.0964370727539, "epoch": 0.8912739150723286, "grad_norm": 3.8928933143615723, "kl": 0.3626220703125, "learning_rate": 3.560528770448712e-08, "loss": 0.064, "reward": 1.3571429014205934, "reward_std": 0.14244568049907685, "rewards/accuracy_reward": 0.3928571566939354, "rewards/format_reward": 0.9642857313156128, "step": 1910 }, { "completion_length": 198.77500915527344, "epoch": 0.8959402706486235, "grad_norm": 4.202512741088867, "kl": 0.386767578125, "learning_rate": 3.264762371670493e-08, "loss": 0.0725, "reward": 1.2428572058677674, "reward_std": 0.18409645855426787, "rewards/accuracy_reward": 0.2928571604192257, "rewards/format_reward": 0.9500000238418579, "step": 1920 }, { "completion_length": 182.1321548461914, "epoch": 0.9006066262249184, "grad_norm": 8.796235084533691, "kl": 0.31845703125, "learning_rate": 2.981404508526653e-08, "loss": 0.049, "reward": 1.3142857670783996, "reward_std": 0.19875723943114282, "rewards/accuracy_reward": 0.35714287757873536, "rewards/format_reward": 0.9571428775787354, "step": 1930 }, { "completion_length": 183.12857818603516, "epoch": 0.9052729818012133, "grad_norm": 5.363933086395264, "kl": 0.4515380859375, "learning_rate": 2.7105304145309317e-08, "loss": 0.0753, "reward": 1.3535714864730835, "reward_std": 0.22955592721700668, "rewards/accuracy_reward": 0.40357145145535467, "rewards/format_reward": 0.9500000238418579, "step": 1940 }, { "completion_length": 178.5964370727539, "epoch": 0.9099393373775082, "grad_norm": 4.07145357131958, "kl": 0.243212890625, "learning_rate": 2.4522120086681975e-08, "loss": 0.035, "reward": 1.3750000596046448, "reward_std": 0.1731257550418377, "rewards/accuracy_reward": 0.38928572833538055, "rewards/format_reward": 0.9857142925262451, "step": 1950 }, { "completion_length": 191.87858123779296, "epoch": 0.9146056929538031, "grad_norm": 4.126840114593506, "kl": 0.7501953125, "learning_rate": 2.2065178762994517e-08, "loss": 0.1034, "reward": 1.2821429014205932, "reward_std": 0.1828709363937378, "rewards/accuracy_reward": 0.3392857313156128, "rewards/format_reward": 0.942857164144516, "step": 1960 }, { "completion_length": 191.27857818603516, "epoch": 0.919272048530098, "grad_norm": 3.2844011783599854, "kl": 1.066259765625, "learning_rate": 1.9735132509519302e-08, "loss": 0.1838, "reward": 1.271428632736206, "reward_std": 0.33275041803717614, "rewards/accuracy_reward": 0.35714287757873536, "rewards/format_reward": 0.9142857491970062, "step": 1970 }, { "completion_length": 182.68929290771484, "epoch": 0.9239384041063929, "grad_norm": 5.930522441864014, "kl": 0.534765625, "learning_rate": 1.7532599969991347e-08, "loss": 0.0479, "reward": 1.2785714864730835, "reward_std": 0.19234431087970733, "rewards/accuracy_reward": 0.3214285895228386, "rewards/format_reward": 0.9571428775787354, "step": 1980 }, { "completion_length": 178.8928649902344, "epoch": 0.9286047596826879, "grad_norm": 0.8274029493331909, "kl": 0.40849609375, "learning_rate": 1.545816593235416e-08, "loss": 0.0388, "reward": 1.3142857789993285, "reward_std": 0.20442002266645432, "rewards/accuracy_reward": 0.3500000134110451, "rewards/format_reward": 0.9642857313156128, "step": 1990 }, { "completion_length": 188.56786499023437, "epoch": 0.9332711152589828, "grad_norm": 6.531651020050049, "kl": 1.097607421875, "learning_rate": 1.3512381173494458e-08, "loss": 0.2064, "reward": 1.3535714983940124, "reward_std": 0.24394118189811706, "rewards/accuracy_reward": 0.4250000178813934, "rewards/format_reward": 0.9285714626312256, "step": 2000 }, { "completion_length": 194.05000762939454, "epoch": 0.9379374708352777, "grad_norm": 7.329962253570557, "kl": 0.97109375, "learning_rate": 1.169576231300684e-08, "loss": 0.1501, "reward": 1.2750000476837158, "reward_std": 0.3136565685272217, "rewards/accuracy_reward": 0.3607142955064774, "rewards/format_reward": 0.9142857491970062, "step": 2010 }, { "completion_length": 198.34286499023438, "epoch": 0.9426038264115726, "grad_norm": 5.229209899902344, "kl": 0.5191162109375, "learning_rate": 1.000879167602764e-08, "loss": 0.087, "reward": 1.285714340209961, "reward_std": 0.1509844921529293, "rewards/accuracy_reward": 0.32857144698500634, "rewards/format_reward": 0.9571428775787354, "step": 2020 }, { "completion_length": 202.35357971191405, "epoch": 0.9472701819878675, "grad_norm": 2.168286085128784, "kl": 0.821923828125, "learning_rate": 8.451917165174404e-09, "loss": 0.1315, "reward": 1.3642857670783997, "reward_std": 0.23890879452228547, "rewards/accuracy_reward": 0.4214285880327225, "rewards/format_reward": 0.9428571581840515, "step": 2030 }, { "completion_length": 205.11786346435548, "epoch": 0.9519365375641624, "grad_norm": 11.269577980041504, "kl": 1.1117919921875, "learning_rate": 7.025552141624369e-09, "loss": 0.2032, "reward": 1.1714286088943482, "reward_std": 0.2642820030450821, "rewards/accuracy_reward": 0.2571428693830967, "rewards/format_reward": 0.9142857372760773, "step": 2040 }, { "completion_length": 190.096435546875, "epoch": 0.9566028931404573, "grad_norm": 0.18682968616485596, "kl": 0.72197265625, "learning_rate": 5.730075315364346e-09, "loss": 0.1334, "reward": 1.2321429133415223, "reward_std": 0.25800683721899986, "rewards/accuracy_reward": 0.29642858505249026, "rewards/format_reward": 0.935714316368103, "step": 2050 }, { "completion_length": 184.56429443359374, "epoch": 0.9612692487167522, "grad_norm": 2.113898754119873, "kl": 0.876220703125, "learning_rate": 4.565830644640223e-09, "loss": 0.1067, "reward": 1.271428632736206, "reward_std": 0.3147573724389076, "rewards/accuracy_reward": 0.3642857313156128, "rewards/format_reward": 0.9071428894996643, "step": 2060 }, { "completion_length": 184.8857208251953, "epoch": 0.9659356042930471, "grad_norm": 6.786799907684326, "kl": 0.4216796875, "learning_rate": 3.533127244634171e-09, "loss": 0.046, "reward": 1.3571429133415223, "reward_std": 0.17572808191180228, "rewards/accuracy_reward": 0.38571430146694186, "rewards/format_reward": 0.9714285790920257, "step": 2070 }, { "completion_length": 188.2321517944336, "epoch": 0.9706019598693421, "grad_norm": 6.5551042556762695, "kl": 0.491552734375, "learning_rate": 2.6322393053916925e-09, "loss": 0.082, "reward": 1.2392857670783997, "reward_std": 0.24259887263178825, "rewards/accuracy_reward": 0.29642858952283857, "rewards/format_reward": 0.9428571701049805, "step": 2080 }, { "completion_length": 197.81429748535157, "epoch": 0.975268315445637, "grad_norm": 3.5728259086608887, "kl": 0.3767578125, "learning_rate": 1.86340601902274e-09, "loss": 0.052, "reward": 1.2035714983940125, "reward_std": 0.18816160932183265, "rewards/accuracy_reward": 0.26071429550647734, "rewards/format_reward": 0.9428571701049805, "step": 2090 }, { "completion_length": 184.00000915527343, "epoch": 0.9799346710219319, "grad_norm": 8.470995903015137, "kl": 0.66552734375, "learning_rate": 1.2268315161944044e-09, "loss": 0.0892, "reward": 1.2678572058677673, "reward_std": 0.16525296717882157, "rewards/accuracy_reward": 0.2964285921305418, "rewards/format_reward": 0.9714285850524902, "step": 2100 }, { "completion_length": 190.1964370727539, "epoch": 0.9846010265982268, "grad_norm": 5.933376312255859, "kl": 0.58525390625, "learning_rate": 7.226848119326057e-10, "loss": 0.0743, "reward": 1.2571429133415222, "reward_std": 0.20183914229273797, "rewards/accuracy_reward": 0.3285714477300644, "rewards/format_reward": 0.9285714566707611, "step": 2110 }, { "completion_length": 203.46429290771485, "epoch": 0.9892673821745217, "grad_norm": 11.98684310913086, "kl": 0.8004150390625, "learning_rate": 3.510997607475974e-10, "loss": 0.144, "reward": 1.2071429133415221, "reward_std": 0.29701889082789423, "rewards/accuracy_reward": 0.28571429923176767, "rewards/format_reward": 0.9214285969734192, "step": 2120 }, { "completion_length": 193.4964370727539, "epoch": 0.9939337377508166, "grad_norm": 1.8809298276901245, "kl": 0.745263671875, "learning_rate": 1.121750210946737e-10, "loss": 0.1279, "reward": 1.2571429133415222, "reward_std": 0.26994478702545166, "rewards/accuracy_reward": 0.3357143022119999, "rewards/format_reward": 0.9214286029338836, "step": 2130 }, { "completion_length": 185.08929443359375, "epoch": 0.9986000933271115, "grad_norm": 1.6485497951507568, "kl": 0.567529296875, "learning_rate": 5.974029179456331e-12, "loss": 0.097, "reward": 1.310714328289032, "reward_std": 0.1667502835392952, "rewards/accuracy_reward": 0.3535714462399483, "rewards/format_reward": 0.9571428775787354, "step": 2140 }, { "epoch": 1.0, "eval_completion_length": 203.0802721296038, "eval_kl": 0.6515764508928571, "eval_loss": 0.1416667252779007, "eval_reward": 1.1919643453189306, "eval_reward_std": 0.2649446129798889, "eval_rewards/accuracy_reward": 0.24681123665400914, "eval_rewards/format_reward": 0.9451530916350228, "eval_runtime": 118.4791, "eval_samples_per_second": 2.532, "eval_steps_per_second": 0.025, "step": 2143 }, { "epoch": 1.0, "step": 2143, "total_flos": 0.0, "train_loss": 0.08651691437249102, "train_runtime": 12893.4863, "train_samples_per_second": 1.163, "train_steps_per_second": 0.166 } ], "logging_steps": 10, "max_steps": 2143, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }