| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 100, |
| "global_step": 2143, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 211.3571517944336, |
| "epoch": 0.004666355576294913, |
| "grad_norm": 2.9974076747894287, |
| "kl": 0.00047588348388671875, |
| "learning_rate": 4.6511627906976744e-08, |
| "loss": 0.044, |
| "reward": 0.6535714566707611, |
| "reward_std": 0.5595175564289093, |
| "rewards/accuracy_reward": 0.010714286193251609, |
| "rewards/format_reward": 0.6428571760654449, |
| "step": 10 |
| }, |
| { |
| "completion_length": 211.85358123779298, |
| "epoch": 0.009332711152589827, |
| "grad_norm": 2.421142816543579, |
| "kl": 0.0005392074584960937, |
| "learning_rate": 9.302325581395349e-08, |
| "loss": 0.0331, |
| "reward": 0.6000000357627868, |
| "reward_std": 0.6225969046354294, |
| "rewards/accuracy_reward": 0.021428572386503218, |
| "rewards/format_reward": 0.5785714596509933, |
| "step": 20 |
| }, |
| { |
| "completion_length": 225.71072235107422, |
| "epoch": 0.013999066728884742, |
| "grad_norm": 2.8735339641571045, |
| "kl": 0.0006612777709960937, |
| "learning_rate": 1.3953488372093021e-07, |
| "loss": 0.0571, |
| "reward": 0.5857143223285675, |
| "reward_std": 0.5592944413423538, |
| "rewards/accuracy_reward": 0.00714285746216774, |
| "rewards/format_reward": 0.5785714656114578, |
| "step": 30 |
| }, |
| { |
| "completion_length": 205.19286651611327, |
| "epoch": 0.018665422305179653, |
| "grad_norm": 2.0310211181640625, |
| "kl": 0.0027374267578125, |
| "learning_rate": 1.8604651162790698e-07, |
| "loss": 0.0065, |
| "reward": 0.6285714566707611, |
| "reward_std": 0.5308447808027268, |
| "rewards/accuracy_reward": 0.01428571492433548, |
| "rewards/format_reward": 0.6142857350409031, |
| "step": 40 |
| }, |
| { |
| "completion_length": 194.54643707275392, |
| "epoch": 0.02333177788147457, |
| "grad_norm": 3.083293914794922, |
| "kl": 0.00838165283203125, |
| "learning_rate": 2.3255813953488372e-07, |
| "loss": 0.0626, |
| "reward": 0.7464286029338837, |
| "reward_std": 0.46758472323417666, |
| "rewards/accuracy_reward": 0.01785714365541935, |
| "rewards/format_reward": 0.7285714626312256, |
| "step": 50 |
| }, |
| { |
| "completion_length": 197.08929595947265, |
| "epoch": 0.027998133457769483, |
| "grad_norm": 2.4293816089630127, |
| "kl": 0.0138580322265625, |
| "learning_rate": 2.7906976744186043e-07, |
| "loss": 0.0292, |
| "reward": 0.685714328289032, |
| "reward_std": 0.47790482342243196, |
| "rewards/accuracy_reward": 0.00714285746216774, |
| "rewards/format_reward": 0.67857146859169, |
| "step": 60 |
| }, |
| { |
| "completion_length": 193.36786499023438, |
| "epoch": 0.032664489034064395, |
| "grad_norm": 1.755541443824768, |
| "kl": 0.00519561767578125, |
| "learning_rate": 3.2558139534883724e-07, |
| "loss": 0.028, |
| "reward": 0.8035714745521545, |
| "reward_std": 0.36622021347284317, |
| "rewards/accuracy_reward": 0.01785714365541935, |
| "rewards/format_reward": 0.7857143342494964, |
| "step": 70 |
| }, |
| { |
| "completion_length": 206.29644012451172, |
| "epoch": 0.03733084461035931, |
| "grad_norm": 1.8061885833740234, |
| "kl": 0.0106536865234375, |
| "learning_rate": 3.7209302325581396e-07, |
| "loss": 0.023, |
| "reward": 0.6857143223285675, |
| "reward_std": 0.4770329385995865, |
| "rewards/accuracy_reward": 0.00714285746216774, |
| "rewards/format_reward": 0.6785714626312256, |
| "step": 80 |
| }, |
| { |
| "completion_length": 205.0571517944336, |
| "epoch": 0.041997200186654225, |
| "grad_norm": 1.5979427099227905, |
| "kl": 0.008978271484375, |
| "learning_rate": 4.186046511627907e-07, |
| "loss": 0.0297, |
| "reward": 0.8071428894996643, |
| "reward_std": 0.3088400363922119, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.8071428894996643, |
| "step": 90 |
| }, |
| { |
| "completion_length": 217.22858123779298, |
| "epoch": 0.04666355576294914, |
| "grad_norm": 1.7703214883804321, |
| "kl": 0.0110107421875, |
| "learning_rate": 4.6511627906976743e-07, |
| "loss": 0.037, |
| "reward": 0.7285714745521545, |
| "reward_std": 0.4834458529949188, |
| "rewards/accuracy_reward": 0.00714285746216774, |
| "rewards/format_reward": 0.7214286148548126, |
| "step": 100 |
| }, |
| { |
| "completion_length": 198.60000915527343, |
| "epoch": 0.05132991133924405, |
| "grad_norm": 1.311502456665039, |
| "kl": 0.0128143310546875, |
| "learning_rate": 5.116279069767442e-07, |
| "loss": 0.0184, |
| "reward": 0.7964286148548126, |
| "reward_std": 0.3687057480216026, |
| "rewards/accuracy_reward": 0.010714286193251609, |
| "rewards/format_reward": 0.7857143342494964, |
| "step": 110 |
| }, |
| { |
| "completion_length": 210.16072540283204, |
| "epoch": 0.05599626691553897, |
| "grad_norm": 4.063913345336914, |
| "kl": 0.010107421875, |
| "learning_rate": 5.581395348837209e-07, |
| "loss": 0.0155, |
| "reward": 0.7964286148548126, |
| "reward_std": 0.34110155403614045, |
| "rewards/accuracy_reward": 0.010714286193251609, |
| "rewards/format_reward": 0.785714328289032, |
| "step": 120 |
| }, |
| { |
| "completion_length": 203.16429443359374, |
| "epoch": 0.06066262249183388, |
| "grad_norm": 1.8343654870986938, |
| "kl": 0.00831298828125, |
| "learning_rate": 6.046511627906976e-07, |
| "loss": 0.0343, |
| "reward": 0.8607143282890319, |
| "reward_std": 0.25221002399921416, |
| "rewards/accuracy_reward": 0.00357142873108387, |
| "rewards/format_reward": 0.8571429014205932, |
| "step": 130 |
| }, |
| { |
| "completion_length": 202.78929595947267, |
| "epoch": 0.06532897806812879, |
| "grad_norm": 0.8969087600708008, |
| "kl": 0.0120025634765625, |
| "learning_rate": 6.511627906976745e-07, |
| "loss": 0.0145, |
| "reward": 0.8321428954601288, |
| "reward_std": 0.2753357619047165, |
| "rewards/accuracy_reward": 0.00357142873108387, |
| "rewards/format_reward": 0.8285714685916901, |
| "step": 140 |
| }, |
| { |
| "completion_length": 200.78929595947267, |
| "epoch": 0.0699953336444237, |
| "grad_norm": 2.4758172035217285, |
| "kl": 0.016839599609375, |
| "learning_rate": 6.976744186046511e-07, |
| "loss": 0.0205, |
| "reward": 0.90357146859169, |
| "reward_std": 0.22839727699756623, |
| "rewards/accuracy_reward": 0.01785714365541935, |
| "rewards/format_reward": 0.8857143342494964, |
| "step": 150 |
| }, |
| { |
| "completion_length": 208.1607238769531, |
| "epoch": 0.07466168922071861, |
| "grad_norm": 1.8052928447723389, |
| "kl": 0.012713623046875, |
| "learning_rate": 7.441860465116279e-07, |
| "loss": 0.0122, |
| "reward": 0.9321428835391998, |
| "reward_std": 0.1500000089406967, |
| "rewards/accuracy_reward": 0.00357142873108387, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 160 |
| }, |
| { |
| "completion_length": 210.90000762939454, |
| "epoch": 0.07932804479701354, |
| "grad_norm": 1.3043458461761475, |
| "kl": 0.0185272216796875, |
| "learning_rate": 7.906976744186046e-07, |
| "loss": 0.0076, |
| "reward": 0.9178571820259094, |
| "reward_std": 0.1664957284927368, |
| "rewards/accuracy_reward": 0.00357142873108387, |
| "rewards/format_reward": 0.9142857551574707, |
| "step": 170 |
| }, |
| { |
| "completion_length": 204.02858123779296, |
| "epoch": 0.08399440037330845, |
| "grad_norm": 1.9711344242095947, |
| "kl": 0.024969482421875, |
| "learning_rate": 8.372093023255814e-07, |
| "loss": -0.0035, |
| "reward": 0.903571480512619, |
| "reward_std": 0.1994871750473976, |
| "rewards/accuracy_reward": 0.010714286193251609, |
| "rewards/format_reward": 0.8928571879863739, |
| "step": 180 |
| }, |
| { |
| "completion_length": 201.1821517944336, |
| "epoch": 0.08866075594960336, |
| "grad_norm": 8.437910079956055, |
| "kl": 0.023162841796875, |
| "learning_rate": 8.837209302325581e-07, |
| "loss": 0.0113, |
| "reward": 0.9178571820259094, |
| "reward_std": 0.18299144729971886, |
| "rewards/accuracy_reward": 0.010714286193251609, |
| "rewards/format_reward": 0.9071428894996643, |
| "step": 190 |
| }, |
| { |
| "completion_length": 205.58929443359375, |
| "epoch": 0.09332711152589827, |
| "grad_norm": 0.09394335001707077, |
| "kl": 0.01868896484375, |
| "learning_rate": 9.302325581395349e-07, |
| "loss": 0.0195, |
| "reward": 0.9214285910129547, |
| "reward_std": 0.13299144804477692, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.9214285910129547, |
| "step": 200 |
| }, |
| { |
| "completion_length": 217.59286651611328, |
| "epoch": 0.09799346710219319, |
| "grad_norm": 2.8316187858581543, |
| "kl": 0.021905517578125, |
| "learning_rate": 9.767441860465115e-07, |
| "loss": 0.0257, |
| "reward": 0.860714340209961, |
| "reward_std": 0.2807814501225948, |
| "rewards/accuracy_reward": 0.010714286193251609, |
| "rewards/format_reward": 0.8500000536441803, |
| "step": 210 |
| }, |
| { |
| "completion_length": 205.23929290771486, |
| "epoch": 0.1026598226784881, |
| "grad_norm": 1.3520991802215576, |
| "kl": 0.015557861328125, |
| "learning_rate": 9.99983405533249e-07, |
| "loss": 0.0129, |
| "reward": 0.9571429014205932, |
| "reward_std": 0.11428571939468384, |
| "rewards/accuracy_reward": 0.00714285746216774, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 220 |
| }, |
| { |
| "completion_length": 201.4571533203125, |
| "epoch": 0.10732617825478301, |
| "grad_norm": 6.0174970626831055, |
| "kl": 0.034246826171875, |
| "learning_rate": 9.99850656408199e-07, |
| "loss": 0.016, |
| "reward": 0.9785714983940125, |
| "reward_std": 0.1857142947614193, |
| "rewards/accuracy_reward": 0.0357142873108387, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 230 |
| }, |
| { |
| "completion_length": 197.5357223510742, |
| "epoch": 0.11199253383107793, |
| "grad_norm": 1.7986341714859009, |
| "kl": 0.0531494140625, |
| "learning_rate": 9.995851934039294e-07, |
| "loss": 0.0582, |
| "reward": 1.096428632736206, |
| "reward_std": 0.23630646169185637, |
| "rewards/accuracy_reward": 0.12500000596046448, |
| "rewards/format_reward": 0.9714285850524902, |
| "step": 240 |
| }, |
| { |
| "completion_length": 208.5428680419922, |
| "epoch": 0.11665888940737285, |
| "grad_norm": 2.8913028240203857, |
| "kl": 0.0509765625, |
| "learning_rate": 9.991870870027424e-07, |
| "loss": 0.0426, |
| "reward": 1.0678572058677673, |
| "reward_std": 0.19603439420461655, |
| "rewards/accuracy_reward": 0.10357143431901931, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 250 |
| }, |
| { |
| "completion_length": 187.68929443359374, |
| "epoch": 0.12132524498366776, |
| "grad_norm": 0.7588302493095398, |
| "kl": 0.05869140625, |
| "learning_rate": 9.98656442904699e-07, |
| "loss": 0.028, |
| "reward": 1.110714328289032, |
| "reward_std": 0.17129081785678862, |
| "rewards/accuracy_reward": 0.13928571566939354, |
| "rewards/format_reward": 0.9714285850524902, |
| "step": 260 |
| }, |
| { |
| "completion_length": 196.61429443359376, |
| "epoch": 0.12599160055996267, |
| "grad_norm": 36.95159912109375, |
| "kl": 0.0792724609375, |
| "learning_rate": 9.979934019995547e-07, |
| "loss": 0.0407, |
| "reward": 1.1428571820259095, |
| "reward_std": 0.299724480509758, |
| "rewards/accuracy_reward": 0.20714286752045155, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 270 |
| }, |
| { |
| "completion_length": 207.1464385986328, |
| "epoch": 0.13065795613625758, |
| "grad_norm": 2.451353073120117, |
| "kl": 0.057666015625, |
| "learning_rate": 9.97198140329352e-07, |
| "loss": 0.032, |
| "reward": 1.1285714864730836, |
| "reward_std": 0.24082783833146096, |
| "rewards/accuracy_reward": 0.17142857611179352, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 280 |
| }, |
| { |
| "completion_length": 207.25000610351563, |
| "epoch": 0.1353243117125525, |
| "grad_norm": 1.8005753755569458, |
| "kl": 0.056298828125, |
| "learning_rate": 9.962708690416806e-07, |
| "loss": 0.0108, |
| "reward": 1.221428632736206, |
| "reward_std": 0.20909458994865418, |
| "rewards/accuracy_reward": 0.25000000894069674, |
| "rewards/format_reward": 0.9714285850524902, |
| "step": 290 |
| }, |
| { |
| "completion_length": 201.07143859863282, |
| "epoch": 0.1399906672888474, |
| "grad_norm": 1.7425264120101929, |
| "kl": 0.052978515625, |
| "learning_rate": 9.952118343336157e-07, |
| "loss": 0.0208, |
| "reward": 1.1678571820259094, |
| "reward_std": 0.24439741671085358, |
| "rewards/accuracy_reward": 0.22500001043081283, |
| "rewards/format_reward": 0.9428571581840515, |
| "step": 300 |
| }, |
| { |
| "completion_length": 211.52858123779296, |
| "epoch": 0.14465702286514232, |
| "grad_norm": 2.8271803855895996, |
| "kl": 0.0644287109375, |
| "learning_rate": 9.940213173863515e-07, |
| "loss": 0.0253, |
| "reward": 1.1714285969734193, |
| "reward_std": 0.2888915419578552, |
| "rewards/accuracy_reward": 0.22857143878936767, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 310 |
| }, |
| { |
| "completion_length": 212.41786499023436, |
| "epoch": 0.14932337844143723, |
| "grad_norm": 1.6736899614334106, |
| "kl": 0.063427734375, |
| "learning_rate": 9.926996342905446e-07, |
| "loss": 0.0327, |
| "reward": 1.2321429014205934, |
| "reward_std": 0.22101710960268975, |
| "rewards/accuracy_reward": 0.26785715818405154, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 320 |
| }, |
| { |
| "completion_length": 222.3071533203125, |
| "epoch": 0.15398973401773214, |
| "grad_norm": 1.0232563018798828, |
| "kl": 0.0625244140625, |
| "learning_rate": 9.912471359623905e-07, |
| "loss": 0.0303, |
| "reward": 1.2321429014205934, |
| "reward_std": 0.27462306767702105, |
| "rewards/accuracy_reward": 0.30357144474983216, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 330 |
| }, |
| { |
| "completion_length": 199.1357223510742, |
| "epoch": 0.15865608959402708, |
| "grad_norm": 0.4898677170276642, |
| "kl": 0.057177734375, |
| "learning_rate": 9.89664208050453e-07, |
| "loss": 0.0513, |
| "reward": 1.2321429133415223, |
| "reward_std": 0.25317725613713266, |
| "rewards/accuracy_reward": 0.27500001192092893, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 340 |
| }, |
| { |
| "completion_length": 184.546435546875, |
| "epoch": 0.163322445170322, |
| "grad_norm": 2.1455774307250977, |
| "kl": 0.069482421875, |
| "learning_rate": 9.879512708332718e-07, |
| "loss": 0.0067, |
| "reward": 1.2250000476837157, |
| "reward_std": 0.18287093117833136, |
| "rewards/accuracy_reward": 0.2607142999768257, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 350 |
| }, |
| { |
| "completion_length": 182.8071517944336, |
| "epoch": 0.1679888007466169, |
| "grad_norm": 1.5658093690872192, |
| "kl": 0.0677490234375, |
| "learning_rate": 9.861087791077743e-07, |
| "loss": 0.0095, |
| "reward": 1.2750000476837158, |
| "reward_std": 0.16363511905074118, |
| "rewards/accuracy_reward": 0.29642858952283857, |
| "rewards/format_reward": 0.9785714387893677, |
| "step": 360 |
| }, |
| { |
| "completion_length": 190.7214385986328, |
| "epoch": 0.1726551563229118, |
| "grad_norm": 1.7997597455978394, |
| "kl": 0.05615234375, |
| "learning_rate": 9.841372220685253e-07, |
| "loss": 0.0157, |
| "reward": 1.235714316368103, |
| "reward_std": 0.2049168437719345, |
| "rewards/accuracy_reward": 0.2785714462399483, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 370 |
| }, |
| { |
| "completion_length": 193.371435546875, |
| "epoch": 0.17732151189920672, |
| "grad_norm": 1.4841110706329346, |
| "kl": 0.060205078125, |
| "learning_rate": 9.820371231778422e-07, |
| "loss": 0.0343, |
| "reward": 1.3142857551574707, |
| "reward_std": 0.23386457264423371, |
| "rewards/accuracy_reward": 0.3642857328057289, |
| "rewards/format_reward": 0.9500000178813934, |
| "step": 380 |
| }, |
| { |
| "completion_length": 201.2071563720703, |
| "epoch": 0.18198786747550164, |
| "grad_norm": 1.3704997301101685, |
| "kl": 0.0554443359375, |
| "learning_rate": 9.79809040026811e-07, |
| "loss": 0.0187, |
| "reward": 1.2250000596046449, |
| "reward_std": 0.2502213083207607, |
| "rewards/accuracy_reward": 0.28214287348091605, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 390 |
| }, |
| { |
| "completion_length": 186.56429443359374, |
| "epoch": 0.18665422305179655, |
| "grad_norm": 2.2330009937286377, |
| "kl": 0.0556884765625, |
| "learning_rate": 9.774535641872433e-07, |
| "loss": 0.0379, |
| "reward": 1.2428571820259093, |
| "reward_std": 0.3126678854227066, |
| "rewards/accuracy_reward": 0.3214285898953676, |
| "rewards/format_reward": 0.9214286029338836, |
| "step": 400 |
| }, |
| { |
| "completion_length": 171.86786651611328, |
| "epoch": 0.19132057862809146, |
| "grad_norm": 1.9319186210632324, |
| "kl": 0.05830078125, |
| "learning_rate": 9.749713210546087e-07, |
| "loss": 0.0061, |
| "reward": 1.371428632736206, |
| "reward_std": 0.2255903147161007, |
| "rewards/accuracy_reward": 0.4000000223517418, |
| "rewards/format_reward": 0.9714285790920257, |
| "step": 410 |
| }, |
| { |
| "completion_length": 194.71429443359375, |
| "epoch": 0.19598693420438637, |
| "grad_norm": 1.76435387134552, |
| "kl": 0.0529052734375, |
| "learning_rate": 9.723629696819884e-07, |
| "loss": 0.0143, |
| "reward": 1.285714340209961, |
| "reward_std": 0.20479509681463243, |
| "rewards/accuracy_reward": 0.3357142999768257, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 420 |
| }, |
| { |
| "completion_length": 188.8714370727539, |
| "epoch": 0.20065328978068128, |
| "grad_norm": 1.158916711807251, |
| "kl": 0.0750244140625, |
| "learning_rate": 9.696292026050922e-07, |
| "loss": 0.0165, |
| "reward": 1.3178571939468384, |
| "reward_std": 0.26929790526628494, |
| "rewards/accuracy_reward": 0.382142873480916, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 430 |
| }, |
| { |
| "completion_length": 179.84644012451173, |
| "epoch": 0.2053196453569762, |
| "grad_norm": 1.382643699645996, |
| "kl": 0.11826171875, |
| "learning_rate": 9.66770745658385e-07, |
| "loss": 0.0183, |
| "reward": 1.2571429371833802, |
| "reward_std": 0.21266788095235825, |
| "rewards/accuracy_reward": 0.3071428656578064, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 440 |
| }, |
| { |
| "completion_length": 180.546435546875, |
| "epoch": 0.2099860009332711, |
| "grad_norm": 1.8777272701263428, |
| "kl": 0.0839599609375, |
| "learning_rate": 9.637883577823721e-07, |
| "loss": 0.0276, |
| "reward": 1.3428572058677672, |
| "reward_std": 0.24049336314201356, |
| "rewards/accuracy_reward": 0.40000001937150953, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 450 |
| }, |
| { |
| "completion_length": 190.68572387695312, |
| "epoch": 0.21465235650956602, |
| "grad_norm": 1.0944005250930786, |
| "kl": 0.0735107421875, |
| "learning_rate": 9.606828308220969e-07, |
| "loss": 0.0113, |
| "reward": 1.2785714745521546, |
| "reward_std": 0.21487789005041122, |
| "rewards/accuracy_reward": 0.32857144325971605, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 460 |
| }, |
| { |
| "completion_length": 197.60357971191405, |
| "epoch": 0.21931871208586096, |
| "grad_norm": 0.9634405970573425, |
| "kl": 0.073779296875, |
| "learning_rate": 9.574549893168977e-07, |
| "loss": 0.0197, |
| "reward": 1.2535714626312255, |
| "reward_std": 0.17683308124542235, |
| "rewards/accuracy_reward": 0.2892857242375612, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 470 |
| }, |
| { |
| "completion_length": 203.69643859863282, |
| "epoch": 0.22398506766215587, |
| "grad_norm": 1.3643263578414917, |
| "kl": 0.08203125, |
| "learning_rate": 9.541056902814896e-07, |
| "loss": 0.0227, |
| "reward": 1.285714340209961, |
| "reward_std": 0.27460705786943435, |
| "rewards/accuracy_reward": 0.37857144623994826, |
| "rewards/format_reward": 0.9071428894996643, |
| "step": 480 |
| }, |
| { |
| "completion_length": 220.28929595947267, |
| "epoch": 0.22865142323845078, |
| "grad_norm": 1.668428897857666, |
| "kl": 0.0689453125, |
| "learning_rate": 9.506358229784194e-07, |
| "loss": 0.0146, |
| "reward": 1.3071429133415222, |
| "reward_std": 0.17622366920113564, |
| "rewards/accuracy_reward": 0.3571428686380386, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 490 |
| }, |
| { |
| "completion_length": 216.17501220703124, |
| "epoch": 0.2333177788147457, |
| "grad_norm": 2.019341468811035, |
| "kl": 0.079345703125, |
| "learning_rate": 9.4704630868196e-07, |
| "loss": 0.0646, |
| "reward": 1.1821429014205933, |
| "reward_std": 0.23459327667951585, |
| "rewards/accuracy_reward": 0.2535714410245419, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 500 |
| }, |
| { |
| "completion_length": 205.07501068115235, |
| "epoch": 0.2379841343910406, |
| "grad_norm": 1.7414186000823975, |
| "kl": 0.0832763671875, |
| "learning_rate": 9.433381004335061e-07, |
| "loss": 0.0468, |
| "reward": 1.2071429073810578, |
| "reward_std": 0.2705150328576565, |
| "rewards/accuracy_reward": 0.30000001192092896, |
| "rewards/format_reward": 0.9071428894996643, |
| "step": 510 |
| }, |
| { |
| "completion_length": 180.91429595947267, |
| "epoch": 0.24265048996733551, |
| "grad_norm": 2.2701659202575684, |
| "kl": 0.124853515625, |
| "learning_rate": 9.395121827885355e-07, |
| "loss": 0.0327, |
| "reward": 1.3142857670783996, |
| "reward_std": 0.21858522519469262, |
| "rewards/accuracy_reward": 0.37142859064042566, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 520 |
| }, |
| { |
| "completion_length": 192.1821517944336, |
| "epoch": 0.24731684554363043, |
| "grad_norm": 14.688100814819336, |
| "kl": 0.170263671875, |
| "learning_rate": 9.355695715552011e-07, |
| "loss": 0.0272, |
| "reward": 1.2142857670783997, |
| "reward_std": 0.20738017484545707, |
| "rewards/accuracy_reward": 0.27142858095467093, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 530 |
| }, |
| { |
| "completion_length": 202.57500915527345, |
| "epoch": 0.25198320111992534, |
| "grad_norm": 2.822713613510132, |
| "kl": 0.1373046875, |
| "learning_rate": 9.315113135246283e-07, |
| "loss": 0.0432, |
| "reward": 1.1928571820259095, |
| "reward_std": 0.35453804582357407, |
| "rewards/accuracy_reward": 0.28571429997682574, |
| "rewards/format_reward": 0.9071429014205933, |
| "step": 540 |
| }, |
| { |
| "completion_length": 202.9321517944336, |
| "epoch": 0.25664955669622025, |
| "grad_norm": 1.5090000629425049, |
| "kl": 0.12421875, |
| "learning_rate": 9.273384861929836e-07, |
| "loss": 0.0491, |
| "reward": 1.1178572058677674, |
| "reward_std": 0.23188644349575044, |
| "rewards/accuracy_reward": 0.19642858132719992, |
| "rewards/format_reward": 0.9214286029338836, |
| "step": 550 |
| }, |
| { |
| "completion_length": 192.82857666015624, |
| "epoch": 0.26131591227251516, |
| "grad_norm": 0.5006041526794434, |
| "kl": 0.205029296875, |
| "learning_rate": 9.230521974753919e-07, |
| "loss": 0.0594, |
| "reward": 1.2321429252624512, |
| "reward_std": 0.2444589801132679, |
| "rewards/accuracy_reward": 0.3321428716182709, |
| "rewards/format_reward": 0.9000000298023224, |
| "step": 560 |
| }, |
| { |
| "completion_length": 171.6428649902344, |
| "epoch": 0.26598226784881007, |
| "grad_norm": 1.1855828762054443, |
| "kl": 0.193359375, |
| "learning_rate": 9.186535854117776e-07, |
| "loss": 0.037, |
| "reward": 1.260714328289032, |
| "reward_std": 0.2411015473306179, |
| "rewards/accuracy_reward": 0.33928573578596116, |
| "rewards/format_reward": 0.9214286029338836, |
| "step": 570 |
| }, |
| { |
| "completion_length": 180.17143707275392, |
| "epoch": 0.270648623425105, |
| "grad_norm": 1.677296757698059, |
| "kl": 0.1857421875, |
| "learning_rate": 9.141438178647065e-07, |
| "loss": 0.0374, |
| "reward": 1.3000000596046448, |
| "reward_std": 0.2666125223040581, |
| "rewards/accuracy_reward": 0.3714285910129547, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 580 |
| }, |
| { |
| "completion_length": 186.6821533203125, |
| "epoch": 0.2753149790013999, |
| "grad_norm": 5.2433247566223145, |
| "kl": 0.16669921875, |
| "learning_rate": 9.095240922093104e-07, |
| "loss": 0.0407, |
| "reward": 1.3500000715255738, |
| "reward_std": 0.18397593572735788, |
| "rewards/accuracy_reward": 0.39285715818405154, |
| "rewards/format_reward": 0.9571428656578064, |
| "step": 590 |
| }, |
| { |
| "completion_length": 180.6428680419922, |
| "epoch": 0.2799813345776948, |
| "grad_norm": 0.4842807650566101, |
| "kl": 0.190380859375, |
| "learning_rate": 9.047956350153752e-07, |
| "loss": 0.0147, |
| "reward": 1.246428620815277, |
| "reward_std": 0.21586237102746964, |
| "rewards/accuracy_reward": 0.28928572684526443, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 600 |
| }, |
| { |
| "completion_length": 191.20358123779297, |
| "epoch": 0.2846476901539897, |
| "grad_norm": 2.350338935852051, |
| "kl": 0.14912109375, |
| "learning_rate": 8.999597017216782e-07, |
| "loss": 0.0334, |
| "reward": 1.3035714983940125, |
| "reward_std": 0.16870573312044143, |
| "rewards/accuracy_reward": 0.33928572833538057, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 610 |
| }, |
| { |
| "completion_length": 193.98215026855468, |
| "epoch": 0.28931404573028463, |
| "grad_norm": 2.8800251483917236, |
| "kl": 0.21923828125, |
| "learning_rate": 8.950175763026604e-07, |
| "loss": 0.0245, |
| "reward": 1.2071429133415221, |
| "reward_std": 0.2516971692442894, |
| "rewards/accuracy_reward": 0.2642857298254967, |
| "rewards/format_reward": 0.942857164144516, |
| "step": 620 |
| }, |
| { |
| "completion_length": 191.61429290771486, |
| "epoch": 0.29398040130657954, |
| "grad_norm": 2.0671656131744385, |
| "kl": 0.1837890625, |
| "learning_rate": 8.899705709275217e-07, |
| "loss": 0.0145, |
| "reward": 1.385714340209961, |
| "reward_std": 0.20700510069727898, |
| "rewards/accuracy_reward": 0.4428571671247482, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 630 |
| }, |
| { |
| "completion_length": 199.62858123779296, |
| "epoch": 0.29864675688287445, |
| "grad_norm": 5.944628715515137, |
| "kl": 0.29609375, |
| "learning_rate": 8.848200256118312e-07, |
| "loss": 0.0386, |
| "reward": 1.246428644657135, |
| "reward_std": 0.2530567437410355, |
| "rewards/accuracy_reward": 0.3035714406520128, |
| "rewards/format_reward": 0.942857164144516, |
| "step": 640 |
| }, |
| { |
| "completion_length": 227.48572540283203, |
| "epoch": 0.30331311245916937, |
| "grad_norm": 8.969457626342773, |
| "kl": 0.387255859375, |
| "learning_rate": 8.795673078617432e-07, |
| "loss": 0.0707, |
| "reward": 1.2464286088943481, |
| "reward_std": 0.2629224382340908, |
| "rewards/accuracy_reward": 0.3250000149011612, |
| "rewards/format_reward": 0.9214286029338836, |
| "step": 650 |
| }, |
| { |
| "completion_length": 210.75715026855468, |
| "epoch": 0.3079794680354643, |
| "grad_norm": 4.416165828704834, |
| "kl": 0.473828125, |
| "learning_rate": 8.74213812310915e-07, |
| "loss": 0.0801, |
| "reward": 1.221428632736206, |
| "reward_std": 0.27719090431928634, |
| "rewards/accuracy_reward": 0.29285715967416764, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 660 |
| }, |
| { |
| "completion_length": 200.8928649902344, |
| "epoch": 0.31264582361175924, |
| "grad_norm": 7.957707405090332, |
| "kl": 0.52421875, |
| "learning_rate": 8.68760960350222e-07, |
| "loss": 0.0485, |
| "reward": 1.221428644657135, |
| "reward_std": 0.31562927216291425, |
| "rewards/accuracy_reward": 0.3071428701281548, |
| "rewards/format_reward": 0.9142857432365418, |
| "step": 670 |
| }, |
| { |
| "completion_length": 194.02857971191406, |
| "epoch": 0.31731217918805416, |
| "grad_norm": 2.773921012878418, |
| "kl": 0.4138671875, |
| "learning_rate": 8.632101997503674e-07, |
| "loss": 0.0431, |
| "reward": 1.246428620815277, |
| "reward_std": 0.2256075546145439, |
| "rewards/accuracy_reward": 0.3178571492433548, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 680 |
| }, |
| { |
| "completion_length": 183.4214324951172, |
| "epoch": 0.32197853476434907, |
| "grad_norm": 3.124154567718506, |
| "kl": 0.189404296875, |
| "learning_rate": 8.575630042774902e-07, |
| "loss": 0.0263, |
| "reward": 1.260714340209961, |
| "reward_std": 0.1950671538710594, |
| "rewards/accuracy_reward": 0.3107142999768257, |
| "rewards/format_reward": 0.950000011920929, |
| "step": 690 |
| }, |
| { |
| "completion_length": 214.20000915527345, |
| "epoch": 0.326644890340644, |
| "grad_norm": 5.170936107635498, |
| "kl": 0.279833984375, |
| "learning_rate": 8.518208733018689e-07, |
| "loss": 0.0798, |
| "reward": 1.2071429252624513, |
| "reward_std": 0.36203873455524443, |
| "rewards/accuracy_reward": 0.30714287534356116, |
| "rewards/format_reward": 0.9000000417232513, |
| "step": 700 |
| }, |
| { |
| "completion_length": 236.79644165039062, |
| "epoch": 0.3313112459169389, |
| "grad_norm": 4.168279647827148, |
| "kl": 0.5509765625, |
| "learning_rate": 8.459853313998283e-07, |
| "loss": 0.1131, |
| "reward": 1.2428571939468385, |
| "reward_std": 0.33234085887670517, |
| "rewards/accuracy_reward": 0.35714287459850313, |
| "rewards/format_reward": 0.8857143223285675, |
| "step": 710 |
| }, |
| { |
| "completion_length": 212.58929443359375, |
| "epoch": 0.3359776014932338, |
| "grad_norm": 5.4327569007873535, |
| "kl": 0.4990234375, |
| "learning_rate": 8.400579279489541e-07, |
| "loss": 0.095, |
| "reward": 1.2250000715255738, |
| "reward_std": 0.31540548279881475, |
| "rewards/accuracy_reward": 0.2964285884052515, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 720 |
| }, |
| { |
| "completion_length": 219.4178665161133, |
| "epoch": 0.3406439570695287, |
| "grad_norm": 6.700848579406738, |
| "kl": 0.37919921875, |
| "learning_rate": 8.340402367167216e-07, |
| "loss": 0.0824, |
| "reward": 1.1571429014205932, |
| "reward_std": 0.30023063272237777, |
| "rewards/accuracy_reward": 0.25000000819563867, |
| "rewards/format_reward": 0.9071428835391998, |
| "step": 730 |
| }, |
| { |
| "completion_length": 193.31786346435547, |
| "epoch": 0.3453103126458236, |
| "grad_norm": 4.084702014923096, |
| "kl": 0.28115234375, |
| "learning_rate": 8.2793385544265e-07, |
| "loss": 0.0288, |
| "reward": 1.2535714745521545, |
| "reward_std": 0.3358024753630161, |
| "rewards/accuracy_reward": 0.339285734295845, |
| "rewards/format_reward": 0.9142857432365418, |
| "step": 740 |
| }, |
| { |
| "completion_length": 208.07858123779297, |
| "epoch": 0.34997666822211854, |
| "grad_norm": 5.686543941497803, |
| "kl": 0.33671875, |
| "learning_rate": 8.217404054140909e-07, |
| "loss": 0.0335, |
| "reward": 1.1714286088943482, |
| "reward_std": 0.2581101007759571, |
| "rewards/accuracy_reward": 0.23571430072188376, |
| "rewards/format_reward": 0.9357143104076385, |
| "step": 750 |
| }, |
| { |
| "completion_length": 205.19286651611327, |
| "epoch": 0.35464302379841345, |
| "grad_norm": 2.9921562671661377, |
| "kl": 0.177294921875, |
| "learning_rate": 8.154615310357649e-07, |
| "loss": 0.0755, |
| "reward": 1.2428571939468385, |
| "reward_std": 0.27265038043260575, |
| "rewards/accuracy_reward": 0.30714286789298056, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 760 |
| }, |
| { |
| "completion_length": 211.06072692871095, |
| "epoch": 0.35930937937470836, |
| "grad_norm": 2.7796568870544434, |
| "kl": 0.48134765625, |
| "learning_rate": 8.090988993931609e-07, |
| "loss": 0.0967, |
| "reward": 1.3071429133415222, |
| "reward_std": 0.27672048956155776, |
| "rewards/accuracy_reward": 0.39285716861486436, |
| "rewards/format_reward": 0.9142857491970062, |
| "step": 770 |
| }, |
| { |
| "completion_length": 210.21429595947265, |
| "epoch": 0.36397573495100327, |
| "grad_norm": 4.467612266540527, |
| "kl": 0.39296875, |
| "learning_rate": 8.026541998099126e-07, |
| "loss": 0.1026, |
| "reward": 1.1821429014205933, |
| "reward_std": 0.20331501960754395, |
| "rewards/accuracy_reward": 0.24642858393490313, |
| "rewards/format_reward": 0.9357143044471741, |
| "step": 780 |
| }, |
| { |
| "completion_length": 184.62858123779296, |
| "epoch": 0.3686420905272982, |
| "grad_norm": 2.8182897567749023, |
| "kl": 0.4158203125, |
| "learning_rate": 7.961291433992723e-07, |
| "loss": 0.0864, |
| "reward": 1.2571429133415222, |
| "reward_std": 0.3440789520740509, |
| "rewards/accuracy_reward": 0.35714287906885145, |
| "rewards/format_reward": 0.9000000417232513, |
| "step": 790 |
| }, |
| { |
| "completion_length": 177.10000915527343, |
| "epoch": 0.3733084461035931, |
| "grad_norm": 5.841248035430908, |
| "kl": 0.25205078125, |
| "learning_rate": 7.895254626097964e-07, |
| "loss": 0.0477, |
| "reward": 1.3178572177886962, |
| "reward_std": 0.26967298090457914, |
| "rewards/accuracy_reward": 0.3750000178813934, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 800 |
| }, |
| { |
| "completion_length": 186.17857971191407, |
| "epoch": 0.377974801679888, |
| "grad_norm": 2.7772974967956543, |
| "kl": 0.372216796875, |
| "learning_rate": 7.828449107653703e-07, |
| "loss": 0.0548, |
| "reward": 1.2035714864730835, |
| "reward_std": 0.21819290220737458, |
| "rewards/accuracy_reward": 0.26071429774165156, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 810 |
| }, |
| { |
| "completion_length": 179.5714370727539, |
| "epoch": 0.3826411572561829, |
| "grad_norm": 3.573528528213501, |
| "kl": 0.5103515625, |
| "learning_rate": 7.760892615996862e-07, |
| "loss": 0.0807, |
| "reward": 1.296428620815277, |
| "reward_std": 0.20319449976086618, |
| "rewards/accuracy_reward": 0.3392857268452644, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 820 |
| }, |
| { |
| "completion_length": 201.9464370727539, |
| "epoch": 0.38730751283247783, |
| "grad_norm": 2.7865989208221436, |
| "kl": 0.32451171875, |
| "learning_rate": 7.692603087853061e-07, |
| "loss": 0.129, |
| "reward": 1.1964286327362061, |
| "reward_std": 0.2745025597512722, |
| "rewards/accuracy_reward": 0.28214287012815475, |
| "rewards/format_reward": 0.9142857372760773, |
| "step": 830 |
| }, |
| { |
| "completion_length": 187.04286499023436, |
| "epoch": 0.39197386840877274, |
| "grad_norm": 5.086669445037842, |
| "kl": 0.412451171875, |
| "learning_rate": 7.623598654574282e-07, |
| "loss": 0.0784, |
| "reward": 1.2285714864730835, |
| "reward_std": 0.19613576233386992, |
| "rewards/accuracy_reward": 0.2785714466124773, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 840 |
| }, |
| { |
| "completion_length": 205.9321517944336, |
| "epoch": 0.39664022398506765, |
| "grad_norm": 6.3026580810546875, |
| "kl": 0.42822265625, |
| "learning_rate": 7.553897637324871e-07, |
| "loss": 0.1118, |
| "reward": 1.23571435213089, |
| "reward_std": 0.264027439057827, |
| "rewards/accuracy_reward": 0.32142858356237414, |
| "rewards/format_reward": 0.9142857491970062, |
| "step": 850 |
| }, |
| { |
| "completion_length": 211.72857971191405, |
| "epoch": 0.40130657956136256, |
| "grad_norm": 7.240902423858643, |
| "kl": 0.609912109375, |
| "learning_rate": 7.483518542217136e-07, |
| "loss": 0.1452, |
| "reward": 1.2392857789993286, |
| "reward_std": 0.2891633503139019, |
| "rewards/accuracy_reward": 0.3250000163912773, |
| "rewards/format_reward": 0.9142857491970062, |
| "step": 860 |
| }, |
| { |
| "completion_length": 196.15000915527344, |
| "epoch": 0.4059729351376575, |
| "grad_norm": 7.294248104095459, |
| "kl": 0.38701171875, |
| "learning_rate": 7.412480055397843e-07, |
| "loss": 0.0556, |
| "reward": 1.2500000596046448, |
| "reward_std": 0.2849683463573456, |
| "rewards/accuracy_reward": 0.3285714417695999, |
| "rewards/format_reward": 0.9214285969734192, |
| "step": 870 |
| }, |
| { |
| "completion_length": 202.96786651611328, |
| "epoch": 0.4106392907139524, |
| "grad_norm": 2.0189414024353027, |
| "kl": 0.35, |
| "learning_rate": 7.340801038086918e-07, |
| "loss": 0.0262, |
| "reward": 1.2250000476837157, |
| "reward_std": 0.19948717057704926, |
| "rewards/accuracy_reward": 0.2750000137835741, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 880 |
| }, |
| { |
| "completion_length": 190.95001068115235, |
| "epoch": 0.4153056462902473, |
| "grad_norm": 11.575712203979492, |
| "kl": 0.3896484375, |
| "learning_rate": 7.268500521569655e-07, |
| "loss": 0.0922, |
| "reward": 1.2142857670783997, |
| "reward_std": 0.26723918691277504, |
| "rewards/accuracy_reward": 0.3071428701281548, |
| "rewards/format_reward": 0.9071428835391998, |
| "step": 890 |
| }, |
| { |
| "completion_length": 181.60000762939453, |
| "epoch": 0.4199720018665422, |
| "grad_norm": 8.530049324035645, |
| "kl": 0.45927734375, |
| "learning_rate": 7.195597702143772e-07, |
| "loss": 0.0985, |
| "reward": 1.1571429133415223, |
| "reward_std": 0.27336115539073946, |
| "rewards/accuracy_reward": 0.22857143953442574, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 900 |
| }, |
| { |
| "completion_length": 190.7464385986328, |
| "epoch": 0.4246383574428371, |
| "grad_norm": 5.701374530792236, |
| "kl": 1.1736328125, |
| "learning_rate": 7.122111936022668e-07, |
| "loss": 0.1988, |
| "reward": 1.2500000596046448, |
| "reward_std": 0.24506716057658195, |
| "rewards/accuracy_reward": 0.32857144474983213, |
| "rewards/format_reward": 0.9214286029338836, |
| "step": 910 |
| }, |
| { |
| "completion_length": 172.73214950561524, |
| "epoch": 0.42930471301913203, |
| "grad_norm": 4.281832695007324, |
| "kl": 1.024951171875, |
| "learning_rate": 7.048062734196204e-07, |
| "loss": 0.1912, |
| "reward": 1.2642857909202576, |
| "reward_std": 0.343449330329895, |
| "rewards/accuracy_reward": 0.35000001043081286, |
| "rewards/format_reward": 0.9142857432365418, |
| "step": 920 |
| }, |
| { |
| "completion_length": 197.4178665161133, |
| "epoch": 0.43397106859542695, |
| "grad_norm": 4.720562934875488, |
| "kl": 0.543603515625, |
| "learning_rate": 6.9734697572504e-07, |
| "loss": 0.0907, |
| "reward": 1.2571429133415222, |
| "reward_std": 0.2518312208354473, |
| "rewards/accuracy_reward": 0.3285714462399483, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 930 |
| }, |
| { |
| "completion_length": 196.92858123779297, |
| "epoch": 0.4386374241717219, |
| "grad_norm": 77.73806762695312, |
| "kl": 1.05439453125, |
| "learning_rate": 6.89835281014741e-07, |
| "loss": 0.1745, |
| "reward": 1.285714328289032, |
| "reward_std": 0.28164542019367217, |
| "rewards/accuracy_reward": 0.3642857253551483, |
| "rewards/format_reward": 0.9214285969734192, |
| "step": 940 |
| }, |
| { |
| "completion_length": 179.39286575317382, |
| "epoch": 0.4433037797480168, |
| "grad_norm": 2.6931166648864746, |
| "kl": 0.42705078125, |
| "learning_rate": 6.822731836967168e-07, |
| "loss": 0.0645, |
| "reward": 1.3428572177886964, |
| "reward_std": 0.19739395827054979, |
| "rewards/accuracy_reward": 0.392857152223587, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 950 |
| }, |
| { |
| "completion_length": 187.36429443359376, |
| "epoch": 0.44797013532431174, |
| "grad_norm": 5.5171427726745605, |
| "kl": 0.443359375, |
| "learning_rate": 6.746626915612085e-07, |
| "loss": 0.0781, |
| "reward": 1.2857143521308898, |
| "reward_std": 0.19799869433045386, |
| "rewards/accuracy_reward": 0.357142873108387, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 960 |
| }, |
| { |
| "completion_length": 184.37500915527343, |
| "epoch": 0.45263649090060665, |
| "grad_norm": 13.36310863494873, |
| "kl": 1.1681640625, |
| "learning_rate": 6.670058252476235e-07, |
| "loss": 0.2008, |
| "reward": 1.3428571820259094, |
| "reward_std": 0.262357784062624, |
| "rewards/accuracy_reward": 0.41428573429584503, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 970 |
| }, |
| { |
| "completion_length": 195.95358123779297, |
| "epoch": 0.45730284647690156, |
| "grad_norm": 2.2677857875823975, |
| "kl": 1.1369140625, |
| "learning_rate": 6.593046177080408e-07, |
| "loss": 0.1455, |
| "reward": 1.1714286088943482, |
| "reward_std": 0.3041789963841438, |
| "rewards/accuracy_reward": 0.2714285865426064, |
| "rewards/format_reward": 0.9000000298023224, |
| "step": 980 |
| }, |
| { |
| "completion_length": 193.67857818603517, |
| "epoch": 0.46196920205319647, |
| "grad_norm": 1.030552625656128, |
| "kl": 0.56953125, |
| "learning_rate": 6.515611136674479e-07, |
| "loss": 0.0992, |
| "reward": 1.2642857789993287, |
| "reward_std": 0.1844715215265751, |
| "rewards/accuracy_reward": 0.32142859101295473, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 990 |
| }, |
| { |
| "completion_length": 191.07143630981446, |
| "epoch": 0.4666355576294914, |
| "grad_norm": 7.8564453125, |
| "kl": 0.504833984375, |
| "learning_rate": 6.437773690808524e-07, |
| "loss": 0.099, |
| "reward": 1.3071429133415222, |
| "reward_std": 0.24715664908289908, |
| "rewards/accuracy_reward": 0.3714285910129547, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 1000 |
| }, |
| { |
| "completion_length": 201.9714370727539, |
| "epoch": 0.4713019132057863, |
| "grad_norm": 2.8117690086364746, |
| "kl": 1.044677734375, |
| "learning_rate": 6.359554505874109e-07, |
| "loss": 0.2054, |
| "reward": 1.196428608894348, |
| "reward_std": 0.3307777248322964, |
| "rewards/accuracy_reward": 0.2821428716182709, |
| "rewards/format_reward": 0.9142857551574707, |
| "step": 1010 |
| }, |
| { |
| "completion_length": 189.82500915527345, |
| "epoch": 0.4759682687820812, |
| "grad_norm": 11.323598861694336, |
| "kl": 0.63037109375, |
| "learning_rate": 6.280974349617214e-07, |
| "loss": 0.095, |
| "reward": 1.2785714745521546, |
| "reward_std": 0.26157640293240547, |
| "rewards/accuracy_reward": 0.35000001788139345, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 1020 |
| }, |
| { |
| "completion_length": 190.90001068115234, |
| "epoch": 0.4806346243583761, |
| "grad_norm": 14.018318176269531, |
| "kl": 0.6455078125, |
| "learning_rate": 6.202054085624261e-07, |
| "loss": 0.1192, |
| "reward": 1.2857143521308898, |
| "reward_std": 0.23778653591871263, |
| "rewards/accuracy_reward": 0.35714287161827085, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 1030 |
| }, |
| { |
| "completion_length": 189.96786499023438, |
| "epoch": 0.48530097993467103, |
| "grad_norm": 5.3534932136535645, |
| "kl": 0.632958984375, |
| "learning_rate": 6.122814667782673e-07, |
| "loss": 0.0864, |
| "reward": 1.2285714626312256, |
| "reward_std": 0.1533150166273117, |
| "rewards/accuracy_reward": 0.2500000149011612, |
| "rewards/format_reward": 0.9785714387893677, |
| "step": 1040 |
| }, |
| { |
| "completion_length": 195.0642936706543, |
| "epoch": 0.48996733551096594, |
| "grad_norm": 6.974513530731201, |
| "kl": 0.494384765625, |
| "learning_rate": 6.043277134717475e-07, |
| "loss": 0.0765, |
| "reward": 1.3321428894996643, |
| "reward_std": 0.1950671575963497, |
| "rewards/accuracy_reward": 0.37500001639127734, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 1050 |
| }, |
| { |
| "completion_length": 204.17501068115234, |
| "epoch": 0.49463369108726085, |
| "grad_norm": 6.67943000793457, |
| "kl": 0.497265625, |
| "learning_rate": 5.963462604205392e-07, |
| "loss": 0.0889, |
| "reward": 1.260714340209961, |
| "reward_std": 0.23447152674198152, |
| "rewards/accuracy_reward": 0.3250000193715096, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 1060 |
| }, |
| { |
| "completion_length": 217.91072692871094, |
| "epoch": 0.49930004666355576, |
| "grad_norm": 2.6225903034210205, |
| "kl": 1.211474609375, |
| "learning_rate": 5.883392267567924e-07, |
| "loss": 0.1539, |
| "reward": 1.2142857789993287, |
| "reward_std": 0.2801480941474438, |
| "rewards/accuracy_reward": 0.28571429699659345, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 1070 |
| }, |
| { |
| "completion_length": 200.80000762939454, |
| "epoch": 0.5039664022398507, |
| "grad_norm": 17.208133697509766, |
| "kl": 1.3998046875, |
| "learning_rate": 5.803087384044902e-07, |
| "loss": 0.2627, |
| "reward": 1.2071429014205932, |
| "reward_std": 0.3747034803032875, |
| "rewards/accuracy_reward": 0.3214285850524902, |
| "rewards/format_reward": 0.885714328289032, |
| "step": 1080 |
| }, |
| { |
| "completion_length": 184.55357818603517, |
| "epoch": 0.5086327578161456, |
| "grad_norm": 4.367872714996338, |
| "kl": 0.80498046875, |
| "learning_rate": 5.722569275150019e-07, |
| "loss": 0.1581, |
| "reward": 1.2642857551574707, |
| "reward_std": 0.2569018341600895, |
| "rewards/accuracy_reward": 0.3285714417695999, |
| "rewards/format_reward": 0.9357143104076385, |
| "step": 1090 |
| }, |
| { |
| "completion_length": 179.18572387695312, |
| "epoch": 0.5132991133924405, |
| "grad_norm": 4.963561058044434, |
| "kl": 0.592578125, |
| "learning_rate": 5.641859319009801e-07, |
| "loss": 0.0957, |
| "reward": 1.3250000715255736, |
| "reward_std": 0.2563889928162098, |
| "rewards/accuracy_reward": 0.3892857372760773, |
| "rewards/format_reward": 0.9357143104076385, |
| "step": 1100 |
| }, |
| { |
| "completion_length": 180.88929290771483, |
| "epoch": 0.5179654689687354, |
| "grad_norm": 1.608384609222412, |
| "kl": 0.348974609375, |
| "learning_rate": 5.560978944687576e-07, |
| "loss": 0.0775, |
| "reward": 1.2714286208152772, |
| "reward_std": 0.19492939710617066, |
| "rewards/accuracy_reward": 0.32142858654260636, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 1110 |
| }, |
| { |
| "completion_length": 164.9321502685547, |
| "epoch": 0.5226318245450303, |
| "grad_norm": 6.095414638519287, |
| "kl": 0.59853515625, |
| "learning_rate": 5.479949626493908e-07, |
| "loss": 0.0792, |
| "reward": 1.3428571939468383, |
| "reward_std": 0.18014808967709542, |
| "rewards/accuracy_reward": 0.37142859399318695, |
| "rewards/format_reward": 0.9714285850524902, |
| "step": 1120 |
| }, |
| { |
| "completion_length": 179.71072387695312, |
| "epoch": 0.5272981801213252, |
| "grad_norm": 13.79627513885498, |
| "kl": 0.299853515625, |
| "learning_rate": 5.398792878285002e-07, |
| "loss": 0.0579, |
| "reward": 1.296428632736206, |
| "reward_std": 0.16341925486922265, |
| "rewards/accuracy_reward": 0.32500001788139343, |
| "rewards/format_reward": 0.9714285850524902, |
| "step": 1130 |
| }, |
| { |
| "completion_length": 189.71429595947265, |
| "epoch": 0.5319645356976201, |
| "grad_norm": 7.43311071395874, |
| "kl": 0.44296875, |
| "learning_rate": 5.317530247750639e-07, |
| "loss": 0.0818, |
| "reward": 1.2785714983940124, |
| "reward_std": 0.1776016980409622, |
| "rewards/accuracy_reward": 0.32142858393490314, |
| "rewards/format_reward": 0.9571428716182708, |
| "step": 1140 |
| }, |
| { |
| "completion_length": 207.7714416503906, |
| "epoch": 0.5366308912739151, |
| "grad_norm": 27.736406326293945, |
| "kl": 1.2056640625, |
| "learning_rate": 5.2361833106931e-07, |
| "loss": 0.2633, |
| "reward": 1.228571480512619, |
| "reward_std": 0.36894305497407914, |
| "rewards/accuracy_reward": 0.3571428719907999, |
| "rewards/format_reward": 0.8714286148548126, |
| "step": 1150 |
| }, |
| { |
| "completion_length": 216.8928695678711, |
| "epoch": 0.54129724685021, |
| "grad_norm": 8.771681785583496, |
| "kl": 0.88974609375, |
| "learning_rate": 5.154773665298648e-07, |
| "loss": 0.1611, |
| "reward": 1.1535714745521546, |
| "reward_std": 0.2724130667746067, |
| "rewards/accuracy_reward": 0.2392857253551483, |
| "rewards/format_reward": 0.9142857491970062, |
| "step": 1160 |
| }, |
| { |
| "completion_length": 190.18572082519532, |
| "epoch": 0.5459636024265049, |
| "grad_norm": 6.968381404876709, |
| "kl": 0.37958984375, |
| "learning_rate": 5.073322926403045e-07, |
| "loss": 0.0619, |
| "reward": 1.260714340209961, |
| "reward_std": 0.15576233565807343, |
| "rewards/accuracy_reward": 0.275000012293458, |
| "rewards/format_reward": 0.9857142925262451, |
| "step": 1170 |
| }, |
| { |
| "completion_length": 220.9714370727539, |
| "epoch": 0.5506299580027998, |
| "grad_norm": 1.6743552684783936, |
| "kl": 0.696728515625, |
| "learning_rate": 4.991852719752678e-07, |
| "loss": 0.1253, |
| "reward": 1.2321429014205934, |
| "reward_std": 0.24198277071118354, |
| "rewards/accuracy_reward": 0.31071430146694184, |
| "rewards/format_reward": 0.9214285969734192, |
| "step": 1180 |
| }, |
| { |
| "completion_length": 203.98929595947266, |
| "epoch": 0.5552963135790947, |
| "grad_norm": 8.832259178161621, |
| "kl": 1.82734375, |
| "learning_rate": 4.910384676262752e-07, |
| "loss": 0.1067, |
| "reward": 1.26071435213089, |
| "reward_std": 0.32460705041885374, |
| "rewards/accuracy_reward": 0.36785716116428374, |
| "rewards/format_reward": 0.8928571701049804, |
| "step": 1190 |
| }, |
| { |
| "completion_length": 188.37858123779296, |
| "epoch": 0.5599626691553896, |
| "grad_norm": 4.268427848815918, |
| "kl": 0.414404296875, |
| "learning_rate": 4.828940426274142e-07, |
| "loss": 0.0621, |
| "reward": 1.3285714864730835, |
| "reward_std": 0.23999654203653337, |
| "rewards/accuracy_reward": 0.3714285850524902, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 1200 |
| }, |
| { |
| "completion_length": 207.16429290771484, |
| "epoch": 0.5646290247316845, |
| "grad_norm": 22.58124542236328, |
| "kl": 0.891845703125, |
| "learning_rate": 4.747541593810377e-07, |
| "loss": 0.1984, |
| "reward": 1.2178572058677672, |
| "reward_std": 0.3189430497586727, |
| "rewards/accuracy_reward": 0.310714303329587, |
| "rewards/format_reward": 0.9071428954601288, |
| "step": 1210 |
| }, |
| { |
| "completion_length": 218.35000915527343, |
| "epoch": 0.5692953803079794, |
| "grad_norm": 7.771918773651123, |
| "kl": 0.96904296875, |
| "learning_rate": 4.666209790836316e-07, |
| "loss": 0.1555, |
| "reward": 1.2107143342494964, |
| "reward_std": 0.3533112980425358, |
| "rewards/accuracy_reward": 0.3178571552038193, |
| "rewards/format_reward": 0.8928571820259095, |
| "step": 1220 |
| }, |
| { |
| "completion_length": 187.87857971191406, |
| "epoch": 0.5739617358842743, |
| "grad_norm": 13.805558204650879, |
| "kl": 0.933251953125, |
| "learning_rate": 4.5849666115200143e-07, |
| "loss": 0.1366, |
| "reward": 1.2500000715255737, |
| "reward_std": 0.21033736318349838, |
| "rewards/accuracy_reward": 0.30714286863803864, |
| "rewards/format_reward": 0.942857164144516, |
| "step": 1230 |
| }, |
| { |
| "completion_length": 197.11786804199218, |
| "epoch": 0.5786280914605693, |
| "grad_norm": 3.612844228744507, |
| "kl": 0.625244140625, |
| "learning_rate": 4.503833626499317e-07, |
| "loss": 0.1048, |
| "reward": 1.1892857551574707, |
| "reward_std": 0.3342569015920162, |
| "rewards/accuracy_reward": 0.2821428693830967, |
| "rewards/format_reward": 0.9071428894996643, |
| "step": 1240 |
| }, |
| { |
| "completion_length": 193.096435546875, |
| "epoch": 0.5832944470368642, |
| "grad_norm": 4.014871597290039, |
| "kl": 0.572314453125, |
| "learning_rate": 4.42283237715471e-07, |
| "loss": 0.0812, |
| "reward": 1.160714316368103, |
| "reward_std": 0.28212499171495437, |
| "rewards/accuracy_reward": 0.26071429550647734, |
| "rewards/format_reward": 0.9000000357627869, |
| "step": 1250 |
| }, |
| { |
| "completion_length": 195.02500762939454, |
| "epoch": 0.5879608026131591, |
| "grad_norm": 4.279513359069824, |
| "kl": 0.7986328125, |
| "learning_rate": 4.3419843698899234e-07, |
| "loss": 0.1005, |
| "reward": 1.2928572058677674, |
| "reward_std": 0.25148131176829336, |
| "rewards/accuracy_reward": 0.3571428656578064, |
| "rewards/format_reward": 0.9357143044471741, |
| "step": 1260 |
| }, |
| { |
| "completion_length": 221.07858123779297, |
| "epoch": 0.592627158189454, |
| "grad_norm": 4.270975589752197, |
| "kl": 0.7974609375, |
| "learning_rate": 4.2613110704218336e-07, |
| "loss": 0.1913, |
| "reward": 1.210714340209961, |
| "reward_std": 0.27596538737416265, |
| "rewards/accuracy_reward": 0.30357144251465795, |
| "rewards/format_reward": 0.9071429014205933, |
| "step": 1270 |
| }, |
| { |
| "completion_length": 192.57857971191407, |
| "epoch": 0.5972935137657489, |
| "grad_norm": 6.560425281524658, |
| "kl": 1.0640625, |
| "learning_rate": 4.1808338980811666e-07, |
| "loss": 0.1447, |
| "reward": 1.2214286208152771, |
| "reward_std": 0.2975998237729073, |
| "rewards/accuracy_reward": 0.3285714462399483, |
| "rewards/format_reward": 0.8928571760654449, |
| "step": 1280 |
| }, |
| { |
| "completion_length": 210.92501220703124, |
| "epoch": 0.6019598693420438, |
| "grad_norm": 2.6335413455963135, |
| "kl": 1.1158203125, |
| "learning_rate": 4.100574220125506e-07, |
| "loss": 0.2254, |
| "reward": 1.2178571939468383, |
| "reward_std": 0.38201676979660987, |
| "rewards/accuracy_reward": 0.3250000149011612, |
| "rewards/format_reward": 0.892857164144516, |
| "step": 1290 |
| }, |
| { |
| "completion_length": 226.9035842895508, |
| "epoch": 0.6066262249183387, |
| "grad_norm": 6.515714645385742, |
| "kl": 1.377734375, |
| "learning_rate": 4.020553346066144e-07, |
| "loss": 0.2749, |
| "reward": 1.2035714745521546, |
| "reward_std": 0.37217203676700594, |
| "rewards/accuracy_reward": 0.3107142999768257, |
| "rewards/format_reward": 0.8928571879863739, |
| "step": 1300 |
| }, |
| { |
| "completion_length": 211.1928680419922, |
| "epoch": 0.6112925804946336, |
| "grad_norm": 16.117403030395508, |
| "kl": 1.2681640625, |
| "learning_rate": 3.9407925220102493e-07, |
| "loss": 0.2125, |
| "reward": 1.1928571820259095, |
| "reward_std": 0.3735316038131714, |
| "rewards/accuracy_reward": 0.2928571544587612, |
| "rewards/format_reward": 0.9000000357627869, |
| "step": 1310 |
| }, |
| { |
| "completion_length": 203.11429290771486, |
| "epoch": 0.6159589360709286, |
| "grad_norm": 7.737660884857178, |
| "kl": 0.8955078125, |
| "learning_rate": 3.86131292501988e-07, |
| "loss": 0.126, |
| "reward": 1.2571429014205933, |
| "reward_std": 0.31292245015501974, |
| "rewards/accuracy_reward": 0.3428571581840515, |
| "rewards/format_reward": 0.9142857551574707, |
| "step": 1320 |
| }, |
| { |
| "completion_length": 188.27857971191406, |
| "epoch": 0.6206252916472235, |
| "grad_norm": 3.5433154106140137, |
| "kl": 0.89716796875, |
| "learning_rate": 3.7821356574893204e-07, |
| "loss": 0.1548, |
| "reward": 1.31071435213089, |
| "reward_std": 0.26513244956731796, |
| "rewards/accuracy_reward": 0.36785716116428374, |
| "rewards/format_reward": 0.942857164144516, |
| "step": 1330 |
| }, |
| { |
| "completion_length": 196.4714385986328, |
| "epoch": 0.6252916472235185, |
| "grad_norm": 6.886636734008789, |
| "kl": 0.853076171875, |
| "learning_rate": 3.7032817415422517e-07, |
| "loss": 0.1634, |
| "reward": 1.2678572058677673, |
| "reward_std": 0.2711702950298786, |
| "rewards/accuracy_reward": 0.3321428790688515, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 1340 |
| }, |
| { |
| "completion_length": 197.9714385986328, |
| "epoch": 0.6299580027998134, |
| "grad_norm": 10.59721565246582, |
| "kl": 1.061083984375, |
| "learning_rate": 3.624772113450223e-07, |
| "loss": 0.1761, |
| "reward": 1.2678572058677673, |
| "reward_std": 0.32303600385785103, |
| "rewards/accuracy_reward": 0.36071430891752243, |
| "rewards/format_reward": 0.9071428954601288, |
| "step": 1350 |
| }, |
| { |
| "completion_length": 180.4321502685547, |
| "epoch": 0.6346243583761083, |
| "grad_norm": 2.4233856201171875, |
| "kl": 0.690625, |
| "learning_rate": 3.5466276180739264e-07, |
| "loss": 0.0947, |
| "reward": 1.2892857670783997, |
| "reward_std": 0.21290518939495087, |
| "rewards/accuracy_reward": 0.3392857272177935, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 1360 |
| }, |
| { |
| "completion_length": 196.02857971191406, |
| "epoch": 0.6392907139524032, |
| "grad_norm": 18.396207809448242, |
| "kl": 0.962060546875, |
| "learning_rate": 3.4688690033287414e-07, |
| "loss": 0.155, |
| "reward": 1.3535714745521545, |
| "reward_std": 0.24271938800811768, |
| "rewards/accuracy_reward": 0.4250000178813934, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 1370 |
| }, |
| { |
| "completion_length": 194.17501068115234, |
| "epoch": 0.6439570695286981, |
| "grad_norm": 12.984419822692871, |
| "kl": 0.37880859375, |
| "learning_rate": 3.3915169146760137e-07, |
| "loss": 0.096, |
| "reward": 1.2642857909202576, |
| "reward_std": 0.2268330782651901, |
| "rewards/accuracy_reward": 0.33571430034935473, |
| "rewards/format_reward": 0.9285714507102967, |
| "step": 1380 |
| }, |
| { |
| "completion_length": 180.59286499023438, |
| "epoch": 0.648623425104993, |
| "grad_norm": 3.471736192703247, |
| "kl": 0.78525390625, |
| "learning_rate": 3.3145918896415394e-07, |
| "loss": 0.0905, |
| "reward": 1.3535714745521545, |
| "reward_std": 0.1773286685347557, |
| "rewards/accuracy_reward": 0.417857164144516, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 1390 |
| }, |
| { |
| "completion_length": 172.69286499023437, |
| "epoch": 0.653289780681288, |
| "grad_norm": 5.568473815917969, |
| "kl": 0.4484130859375, |
| "learning_rate": 3.2381143523627106e-07, |
| "loss": 0.0142, |
| "reward": 1.3071429252624511, |
| "reward_std": 0.19887898862361908, |
| "rewards/accuracy_reward": 0.3500000134110451, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 1400 |
| }, |
| { |
| "completion_length": 197.35714874267578, |
| "epoch": 0.6579561362575829, |
| "grad_norm": 5.128924369812012, |
| "kl": 0.848291015625, |
| "learning_rate": 3.16210460816576e-07, |
| "loss": 0.1411, |
| "reward": 1.2464286088943481, |
| "reward_std": 0.20331502109766006, |
| "rewards/accuracy_reward": 0.3035714402794838, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 1410 |
| }, |
| { |
| "completion_length": 205.62858123779296, |
| "epoch": 0.6626224918338778, |
| "grad_norm": 6.538782596588135, |
| "kl": 0.71435546875, |
| "learning_rate": 3.086582838174551e-07, |
| "loss": 0.1207, |
| "reward": 1.210714328289032, |
| "reward_std": 0.26858522146940234, |
| "rewards/accuracy_reward": 0.2750000096857548, |
| "rewards/format_reward": 0.9357143104076385, |
| "step": 1420 |
| }, |
| { |
| "completion_length": 177.58214874267577, |
| "epoch": 0.6672888474101727, |
| "grad_norm": 2.0602848529815674, |
| "kl": 0.540087890625, |
| "learning_rate": 3.0115690939523514e-07, |
| "loss": 0.0609, |
| "reward": 1.2571429014205933, |
| "reward_std": 0.19617216065526008, |
| "rewards/accuracy_reward": 0.3071428701281548, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 1430 |
| }, |
| { |
| "completion_length": 190.9821517944336, |
| "epoch": 0.6719552029864676, |
| "grad_norm": 1.0282678604125977, |
| "kl": 0.74091796875, |
| "learning_rate": 2.9370832921779983e-07, |
| "loss": 0.1188, |
| "reward": 1.2035714626312255, |
| "reward_std": 0.22572807371616363, |
| "rewards/accuracy_reward": 0.25357144251465796, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 1440 |
| }, |
| { |
| "completion_length": 197.7464385986328, |
| "epoch": 0.6766215585627625, |
| "grad_norm": 5.613475799560547, |
| "kl": 0.711767578125, |
| "learning_rate": 2.8631452093578814e-07, |
| "loss": 0.1211, |
| "reward": 1.3035714864730834, |
| "reward_std": 0.20981329679489136, |
| "rewards/accuracy_reward": 0.3750000141561031, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 1450 |
| }, |
| { |
| "completion_length": 200.86786651611328, |
| "epoch": 0.6812879141390574, |
| "grad_norm": 6.9082841873168945, |
| "kl": 1.02734375, |
| "learning_rate": 2.7897744765751375e-07, |
| "loss": 0.1942, |
| "reward": 1.321428620815277, |
| "reward_std": 0.2701858140528202, |
| "rewards/accuracy_reward": 0.4000000197440386, |
| "rewards/format_reward": 0.9214286088943482, |
| "step": 1460 |
| }, |
| { |
| "completion_length": 191.17858123779297, |
| "epoch": 0.6859542697153523, |
| "grad_norm": 116.39089965820312, |
| "kl": 0.575634765625, |
| "learning_rate": 2.716990574277469e-07, |
| "loss": 0.086, |
| "reward": 1.2821429133415223, |
| "reward_std": 0.21748021617531776, |
| "rewards/accuracy_reward": 0.33214287310838697, |
| "rewards/format_reward": 0.9500000178813934, |
| "step": 1470 |
| }, |
| { |
| "completion_length": 196.97500762939453, |
| "epoch": 0.6906206252916472, |
| "grad_norm": 5.2841033935546875, |
| "kl": 0.8533203125, |
| "learning_rate": 2.644812827104933e-07, |
| "loss": 0.1501, |
| "reward": 1.175000047683716, |
| "reward_std": 0.30686734020709994, |
| "rewards/accuracy_reward": 0.2821428686380386, |
| "rewards/format_reward": 0.8928571820259095, |
| "step": 1480 |
| }, |
| { |
| "completion_length": 186.7571533203125, |
| "epoch": 0.6952869808679422, |
| "grad_norm": 4.615259170532227, |
| "kl": 0.45205078125, |
| "learning_rate": 2.573260398759125e-07, |
| "loss": 0.0948, |
| "reward": 1.346428632736206, |
| "reward_std": 0.12943540289998054, |
| "rewards/accuracy_reward": 0.36785715967416766, |
| "rewards/format_reward": 0.9785714387893677, |
| "step": 1490 |
| }, |
| { |
| "completion_length": 175.87857818603516, |
| "epoch": 0.6999533364442371, |
| "grad_norm": 2.3132364749908447, |
| "kl": 0.661865234375, |
| "learning_rate": 2.5023522869150705e-07, |
| "loss": 0.0561, |
| "reward": 1.2535714864730836, |
| "reward_std": 0.21969022005796432, |
| "rewards/accuracy_reward": 0.31785715371370316, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 1500 |
| }, |
| { |
| "completion_length": 187.2821517944336, |
| "epoch": 0.704619692020532, |
| "grad_norm": 29.17850685119629, |
| "kl": 0.95556640625, |
| "learning_rate": 2.432107318177217e-07, |
| "loss": 0.1785, |
| "reward": 1.3500000596046449, |
| "reward_std": 0.25344905629754066, |
| "rewards/accuracy_reward": 0.42142859399318694, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 1510 |
| }, |
| { |
| "completion_length": 177.58572235107422, |
| "epoch": 0.7092860475968269, |
| "grad_norm": 1.220908284187317, |
| "kl": 0.591015625, |
| "learning_rate": 2.3625441430808347e-07, |
| "loss": 0.0738, |
| "reward": 1.4071429133415223, |
| "reward_std": 0.21487789303064347, |
| "rewards/accuracy_reward": 0.4714285969734192, |
| "rewards/format_reward": 0.9357143044471741, |
| "step": 1520 |
| }, |
| { |
| "completion_length": 180.8821533203125, |
| "epoch": 0.7139524031731218, |
| "grad_norm": 4.141109943389893, |
| "kl": 0.378857421875, |
| "learning_rate": 2.2936812311401682e-07, |
| "loss": 0.0597, |
| "reward": 1.3000000715255737, |
| "reward_std": 0.17239581793546677, |
| "rewards/accuracy_reward": 0.3357143022119999, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 1530 |
| }, |
| { |
| "completion_length": 183.73214874267578, |
| "epoch": 0.7186187587494167, |
| "grad_norm": 10.828474044799805, |
| "kl": 0.504296875, |
| "learning_rate": 2.225536865944646e-07, |
| "loss": 0.0564, |
| "reward": 1.3321429133415221, |
| "reward_std": 0.14654723256826402, |
| "rewards/accuracy_reward": 0.3607143074274063, |
| "rewards/format_reward": 0.9714285850524902, |
| "step": 1540 |
| }, |
| { |
| "completion_length": 189.9714370727539, |
| "epoch": 0.7232851143257116, |
| "grad_norm": 7.55503511428833, |
| "kl": 0.721484375, |
| "learning_rate": 2.1581291403044632e-07, |
| "loss": 0.1054, |
| "reward": 1.2250000596046449, |
| "reward_std": 0.2633721731603146, |
| "rewards/accuracy_reward": 0.2892857283353806, |
| "rewards/format_reward": 0.9357143104076385, |
| "step": 1550 |
| }, |
| { |
| "completion_length": 180.25000762939453, |
| "epoch": 0.7279514699020065, |
| "grad_norm": 9.971400260925293, |
| "kl": 0.6979736328125, |
| "learning_rate": 2.0914759514468106e-07, |
| "loss": 0.1232, |
| "reward": 1.2785714745521546, |
| "reward_std": 0.2545368172228336, |
| "rewards/accuracy_reward": 0.357142873480916, |
| "rewards/format_reward": 0.9214286029338836, |
| "step": 1560 |
| }, |
| { |
| "completion_length": 198.396435546875, |
| "epoch": 0.7326178254783015, |
| "grad_norm": 9.208026885986328, |
| "kl": 1.296337890625, |
| "learning_rate": 2.0255949962640333e-07, |
| "loss": 0.2623, |
| "reward": 1.2785715103149413, |
| "reward_std": 0.27719091176986693, |
| "rewards/accuracy_reward": 0.3785714417695999, |
| "rewards/format_reward": 0.900000023841858, |
| "step": 1570 |
| }, |
| { |
| "completion_length": 204.6928680419922, |
| "epoch": 0.7372841810545964, |
| "grad_norm": 7.680899620056152, |
| "kl": 0.668994140625, |
| "learning_rate": 1.9605037666149832e-07, |
| "loss": 0.1278, |
| "reward": 1.2857143521308898, |
| "reward_std": 0.24715665131807327, |
| "rewards/accuracy_reward": 0.357142873108387, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 1580 |
| }, |
| { |
| "completion_length": 185.85357818603515, |
| "epoch": 0.7419505366308913, |
| "grad_norm": 8.488438606262207, |
| "kl": 0.361083984375, |
| "learning_rate": 1.8962195446808083e-07, |
| "loss": 0.0404, |
| "reward": 1.196428644657135, |
| "reward_std": 0.25750192254781723, |
| "rewards/accuracy_reward": 0.26785715520381925, |
| "rewards/format_reward": 0.9285714507102967, |
| "step": 1590 |
| }, |
| { |
| "completion_length": 196.72500762939453, |
| "epoch": 0.7466168922071862, |
| "grad_norm": 4.548067092895508, |
| "kl": 1.42060546875, |
| "learning_rate": 1.8327593983764057e-07, |
| "loss": 0.2529, |
| "reward": 1.335714328289032, |
| "reward_std": 0.377190912514925, |
| "rewards/accuracy_reward": 0.4285714507102966, |
| "rewards/format_reward": 0.9071428894996643, |
| "step": 1600 |
| }, |
| { |
| "completion_length": 177.2678649902344, |
| "epoch": 0.7512832477834811, |
| "grad_norm": 3.3683552742004395, |
| "kl": 0.58388671875, |
| "learning_rate": 1.770140176818774e-07, |
| "loss": 0.0739, |
| "reward": 1.3428571939468383, |
| "reward_std": 0.18543876633048056, |
| "rewards/accuracy_reward": 0.3642857313156128, |
| "rewards/format_reward": 0.9785714387893677, |
| "step": 1610 |
| }, |
| { |
| "completion_length": 203.56429595947264, |
| "epoch": 0.755949603359776, |
| "grad_norm": 3.158090114593506, |
| "kl": 0.81240234375, |
| "learning_rate": 1.7083785058534566e-07, |
| "loss": 0.1285, |
| "reward": 1.2821429371833801, |
| "reward_std": 0.24443381130695344, |
| "rewards/accuracy_reward": 0.3392857268452644, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 1620 |
| }, |
| { |
| "completion_length": 203.41429443359374, |
| "epoch": 0.7606159589360709, |
| "grad_norm": 3.8291237354278564, |
| "kl": 1.1529296875, |
| "learning_rate": 1.6474907836402507e-07, |
| "loss": 0.1792, |
| "reward": 1.2678571939468384, |
| "reward_std": 0.24378738924860954, |
| "rewards/accuracy_reward": 0.3464285895228386, |
| "rewards/format_reward": 0.9214285969734192, |
| "step": 1630 |
| }, |
| { |
| "completion_length": 204.32500915527345, |
| "epoch": 0.7652823145123658, |
| "grad_norm": 127.166259765625, |
| "kl": 1.08779296875, |
| "learning_rate": 1.5874931762993933e-07, |
| "loss": 0.1349, |
| "reward": 1.196428620815277, |
| "reward_std": 0.26433941870927813, |
| "rewards/accuracy_reward": 0.28928572684526443, |
| "rewards/format_reward": 0.9071428775787354, |
| "step": 1640 |
| }, |
| { |
| "completion_length": 210.50001068115233, |
| "epoch": 0.7699486700886607, |
| "grad_norm": 1.736830472946167, |
| "kl": 0.989453125, |
| "learning_rate": 1.5284016136193396e-07, |
| "loss": 0.2122, |
| "reward": 1.2178571939468383, |
| "reward_std": 0.2605919159948826, |
| "rewards/accuracy_reward": 0.2964285835623741, |
| "rewards/format_reward": 0.9214286088943482, |
| "step": 1650 |
| }, |
| { |
| "completion_length": 201.36072235107423, |
| "epoch": 0.7746150256649557, |
| "grad_norm": 10.542801856994629, |
| "kl": 1.331201171875, |
| "learning_rate": 1.4702317848272838e-07, |
| "loss": 0.2161, |
| "reward": 1.3214286327362061, |
| "reward_std": 0.28298772796988486, |
| "rewards/accuracy_reward": 0.40000002086162567, |
| "rewards/format_reward": 0.9214286029338836, |
| "step": 1660 |
| }, |
| { |
| "completion_length": 192.8857208251953, |
| "epoch": 0.7792813812412506, |
| "grad_norm": 6.193233966827393, |
| "kl": 1.0849609375, |
| "learning_rate": 1.4129991344235653e-07, |
| "loss": 0.1358, |
| "reward": 1.2321429014205934, |
| "reward_std": 0.2178552895784378, |
| "rewards/accuracy_reward": 0.31785715706646445, |
| "rewards/format_reward": 0.9142857432365418, |
| "step": 1670 |
| }, |
| { |
| "completion_length": 173.0357223510742, |
| "epoch": 0.7839477368175455, |
| "grad_norm": 16.36906623840332, |
| "kl": 0.372314453125, |
| "learning_rate": 1.3567188580810435e-07, |
| "loss": 0.0753, |
| "reward": 1.4285714864730834, |
| "reward_std": 0.19271938651800155, |
| "rewards/accuracy_reward": 0.46428574323654176, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 1680 |
| }, |
| { |
| "completion_length": 194.05357971191407, |
| "epoch": 0.7886140923938404, |
| "grad_norm": 2.392770290374756, |
| "kl": 0.406396484375, |
| "learning_rate": 1.3014058986105374e-07, |
| "loss": 0.0856, |
| "reward": 1.2535714864730836, |
| "reward_std": 0.15812735334038736, |
| "rewards/accuracy_reward": 0.2892857283353806, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 1690 |
| }, |
| { |
| "completion_length": 175.20000762939452, |
| "epoch": 0.7932804479701353, |
| "grad_norm": 4.713147163391113, |
| "kl": 0.342333984375, |
| "learning_rate": 1.2470749419934057e-07, |
| "loss": 0.0522, |
| "reward": 1.435714340209961, |
| "reward_std": 0.1269535943865776, |
| "rewards/accuracy_reward": 0.4428571715950966, |
| "rewards/format_reward": 0.9928571462631226, |
| "step": 1700 |
| }, |
| { |
| "completion_length": 178.71429595947265, |
| "epoch": 0.7979468035464302, |
| "grad_norm": 2.6368408203125, |
| "kl": 0.63681640625, |
| "learning_rate": 1.1937404134823175e-07, |
| "loss": 0.0749, |
| "reward": 1.2642857551574707, |
| "reward_std": 0.21649573594331742, |
| "rewards/accuracy_reward": 0.3000000137835741, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 1710 |
| }, |
| { |
| "completion_length": 180.82500915527345, |
| "epoch": 0.8026131591227251, |
| "grad_norm": 4.20245361328125, |
| "kl": 0.404296875, |
| "learning_rate": 1.1414164737712401e-07, |
| "loss": 0.0445, |
| "reward": 1.3035714864730834, |
| "reward_std": 0.21377288773655892, |
| "rewards/accuracy_reward": 0.35357144474983215, |
| "rewards/format_reward": 0.9500000178813934, |
| "step": 1720 |
| }, |
| { |
| "completion_length": 193.33929290771485, |
| "epoch": 0.80727951469902, |
| "grad_norm": 12.832620620727539, |
| "kl": 0.924365234375, |
| "learning_rate": 1.0901170152356775e-07, |
| "loss": 0.1151, |
| "reward": 1.2142857551574706, |
| "reward_std": 0.24824440032243728, |
| "rewards/accuracy_reward": 0.3000000149011612, |
| "rewards/format_reward": 0.9142857551574707, |
| "step": 1730 |
| }, |
| { |
| "completion_length": 195.20358123779297, |
| "epoch": 0.811945870275315, |
| "grad_norm": 8.172798156738281, |
| "kl": 0.75888671875, |
| "learning_rate": 1.0398556582441481e-07, |
| "loss": 0.1337, |
| "reward": 1.271428644657135, |
| "reward_std": 0.271032539755106, |
| "rewards/accuracy_reward": 0.3428571570664644, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 1740 |
| }, |
| { |
| "completion_length": 183.7714385986328, |
| "epoch": 0.8166122258516099, |
| "grad_norm": 12.028782844543457, |
| "kl": 1.259375, |
| "learning_rate": 9.906457475418778e-08, |
| "loss": 0.1913, |
| "reward": 1.3107143759727478, |
| "reward_std": 0.25871951803565024, |
| "rewards/accuracy_reward": 0.38214287757873533, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 1750 |
| }, |
| { |
| "completion_length": 195.21072387695312, |
| "epoch": 0.8212785814279048, |
| "grad_norm": 8.357789039611816, |
| "kl": 0.935986328125, |
| "learning_rate": 9.425003487076789e-08, |
| "loss": 0.1143, |
| "reward": 1.2392857670783997, |
| "reward_std": 0.2283131591975689, |
| "rewards/accuracy_reward": 0.30357144549489024, |
| "rewards/format_reward": 0.9357143104076385, |
| "step": 1760 |
| }, |
| { |
| "completion_length": 186.48572387695313, |
| "epoch": 0.8259449370041997, |
| "grad_norm": 4.591169834136963, |
| "kl": 0.8271484375, |
| "learning_rate": 8.954322446849444e-08, |
| "loss": 0.1123, |
| "reward": 1.3535715103149415, |
| "reward_std": 0.23052316084504126, |
| "rewards/accuracy_reward": 0.40357144773006437, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 1770 |
| }, |
| { |
| "completion_length": 203.71072082519532, |
| "epoch": 0.8306112925804946, |
| "grad_norm": 4.072372913360596, |
| "kl": 1.01142578125, |
| "learning_rate": 8.494539323876871e-08, |
| "loss": 0.1496, |
| "reward": 1.2678571939468384, |
| "reward_std": 0.24972448274493217, |
| "rewards/accuracy_reward": 0.34642858654260633, |
| "rewards/format_reward": 0.9214285969734192, |
| "step": 1780 |
| }, |
| { |
| "completion_length": 173.66429290771484, |
| "epoch": 0.8352776481567895, |
| "grad_norm": 1.4727064371109009, |
| "kl": 0.4848876953125, |
| "learning_rate": 8.045776193825204e-08, |
| "loss": 0.0449, |
| "reward": 1.335714340209961, |
| "reward_std": 0.20700509771704673, |
| "rewards/accuracy_reward": 0.3857143074274063, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 1790 |
| }, |
| { |
| "completion_length": 178.2857223510742, |
| "epoch": 0.8399440037330844, |
| "grad_norm": 2.1876659393310547, |
| "kl": 0.60693359375, |
| "learning_rate": 7.608152206474638e-08, |
| "loss": 0.0354, |
| "reward": 1.36071435213089, |
| "reward_std": 0.2141479544341564, |
| "rewards/accuracy_reward": 0.41071430742740633, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 1800 |
| }, |
| { |
| "completion_length": 184.4964370727539, |
| "epoch": 0.8446103593093793, |
| "grad_norm": 1.164255976676941, |
| "kl": 0.330322265625, |
| "learning_rate": 7.181783554084308e-08, |
| "loss": 0.0332, |
| "reward": 1.3071429133415222, |
| "reward_std": 0.13999654576182366, |
| "rewards/accuracy_reward": 0.32142859026789666, |
| "rewards/format_reward": 0.9857142925262451, |
| "step": 1810 |
| }, |
| { |
| "completion_length": 180.47501068115236, |
| "epoch": 0.8492767148856742, |
| "grad_norm": 17.824142456054688, |
| "kl": 0.4853271484375, |
| "learning_rate": 6.766783440542434e-08, |
| "loss": 0.0599, |
| "reward": 1.2642857670783996, |
| "reward_std": 0.19838216677308082, |
| "rewards/accuracy_reward": 0.32857144847512243, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 1820 |
| }, |
| { |
| "completion_length": 187.25000915527343, |
| "epoch": 0.8539430704619692, |
| "grad_norm": 7.02644681930542, |
| "kl": 0.693896484375, |
| "learning_rate": 6.363262051309908e-08, |
| "loss": 0.1129, |
| "reward": 1.2892857670783997, |
| "reward_std": 0.27438203766942026, |
| "rewards/accuracy_reward": 0.35357144474983215, |
| "rewards/format_reward": 0.9357143104076385, |
| "step": 1830 |
| }, |
| { |
| "completion_length": 190.3964370727539, |
| "epoch": 0.8586094260382641, |
| "grad_norm": 0.5623534321784973, |
| "kl": 0.6712890625, |
| "learning_rate": 5.971326524165226e-08, |
| "loss": 0.1025, |
| "reward": 1.296428644657135, |
| "reward_std": 0.2583474151790142, |
| "rewards/accuracy_reward": 0.36785715967416766, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 1840 |
| }, |
| { |
| "completion_length": 191.0107223510742, |
| "epoch": 0.863275781614559, |
| "grad_norm": 0.3061552047729492, |
| "kl": 0.79541015625, |
| "learning_rate": 5.591080920758695e-08, |
| "loss": 0.1553, |
| "reward": 1.2821429252624512, |
| "reward_std": 0.2908777602016926, |
| "rewards/accuracy_reward": 0.3678571581840515, |
| "rewards/format_reward": 0.9142857491970062, |
| "step": 1850 |
| }, |
| { |
| "completion_length": 190.6214385986328, |
| "epoch": 0.8679421371908539, |
| "grad_norm": 4.034696578979492, |
| "kl": 0.664599609375, |
| "learning_rate": 5.22262619898331e-08, |
| "loss": 0.1263, |
| "reward": 1.2821429133415223, |
| "reward_std": 0.24259887337684632, |
| "rewards/accuracy_reward": 0.3464285880327225, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 1860 |
| }, |
| { |
| "completion_length": 192.10358276367188, |
| "epoch": 0.8726084927671488, |
| "grad_norm": 11.7506742477417, |
| "kl": 1.0009765625, |
| "learning_rate": 4.8660601861697294e-08, |
| "loss": 0.1442, |
| "reward": 1.3321429133415221, |
| "reward_std": 0.2509672470390797, |
| "rewards/accuracy_reward": 0.39642858803272246, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 1870 |
| }, |
| { |
| "completion_length": 194.44286346435547, |
| "epoch": 0.8772748483434438, |
| "grad_norm": 2.883983850479126, |
| "kl": 0.7970703125, |
| "learning_rate": 4.5214775531124184e-08, |
| "loss": 0.0791, |
| "reward": 1.2214286148548126, |
| "reward_std": 0.2235020525753498, |
| "rewards/accuracy_reward": 0.2785714402794838, |
| "rewards/format_reward": 0.9428571581840515, |
| "step": 1880 |
| }, |
| { |
| "completion_length": 177.9107223510742, |
| "epoch": 0.8819412039197387, |
| "grad_norm": 12.7833251953125, |
| "kl": 0.6814453125, |
| "learning_rate": 4.188969788933899e-08, |
| "loss": 0.0794, |
| "reward": 1.2928571939468383, |
| "reward_std": 0.2521940000355244, |
| "rewards/accuracy_reward": 0.36428572833538053, |
| "rewards/format_reward": 0.9285714507102967, |
| "step": 1890 |
| }, |
| { |
| "completion_length": 181.83929595947265, |
| "epoch": 0.8866075594960336, |
| "grad_norm": 1.639757513999939, |
| "kl": 1.08232421875, |
| "learning_rate": 3.8686251767937325e-08, |
| "loss": 0.1071, |
| "reward": 1.3000000596046448, |
| "reward_std": 0.22904308661818504, |
| "rewards/accuracy_reward": 0.35714287161827085, |
| "rewards/format_reward": 0.942857164144516, |
| "step": 1900 |
| }, |
| { |
| "completion_length": 192.0964370727539, |
| "epoch": 0.8912739150723286, |
| "grad_norm": 3.8928933143615723, |
| "kl": 0.3626220703125, |
| "learning_rate": 3.560528770448712e-08, |
| "loss": 0.064, |
| "reward": 1.3571429014205934, |
| "reward_std": 0.14244568049907685, |
| "rewards/accuracy_reward": 0.3928571566939354, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 1910 |
| }, |
| { |
| "completion_length": 198.77500915527344, |
| "epoch": 0.8959402706486235, |
| "grad_norm": 4.202512741088867, |
| "kl": 0.386767578125, |
| "learning_rate": 3.264762371670493e-08, |
| "loss": 0.0725, |
| "reward": 1.2428572058677674, |
| "reward_std": 0.18409645855426787, |
| "rewards/accuracy_reward": 0.2928571604192257, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 1920 |
| }, |
| { |
| "completion_length": 182.1321548461914, |
| "epoch": 0.9006066262249184, |
| "grad_norm": 8.796235084533691, |
| "kl": 0.31845703125, |
| "learning_rate": 2.981404508526653e-08, |
| "loss": 0.049, |
| "reward": 1.3142857670783996, |
| "reward_std": 0.19875723943114282, |
| "rewards/accuracy_reward": 0.35714287757873536, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 1930 |
| }, |
| { |
| "completion_length": 183.12857818603516, |
| "epoch": 0.9052729818012133, |
| "grad_norm": 5.363933086395264, |
| "kl": 0.4515380859375, |
| "learning_rate": 2.7105304145309317e-08, |
| "loss": 0.0753, |
| "reward": 1.3535714864730835, |
| "reward_std": 0.22955592721700668, |
| "rewards/accuracy_reward": 0.40357145145535467, |
| "rewards/format_reward": 0.9500000238418579, |
| "step": 1940 |
| }, |
| { |
| "completion_length": 178.5964370727539, |
| "epoch": 0.9099393373775082, |
| "grad_norm": 4.07145357131958, |
| "kl": 0.243212890625, |
| "learning_rate": 2.4522120086681975e-08, |
| "loss": 0.035, |
| "reward": 1.3750000596046448, |
| "reward_std": 0.1731257550418377, |
| "rewards/accuracy_reward": 0.38928572833538055, |
| "rewards/format_reward": 0.9857142925262451, |
| "step": 1950 |
| }, |
| { |
| "completion_length": 191.87858123779296, |
| "epoch": 0.9146056929538031, |
| "grad_norm": 4.126840114593506, |
| "kl": 0.7501953125, |
| "learning_rate": 2.2065178762994517e-08, |
| "loss": 0.1034, |
| "reward": 1.2821429014205932, |
| "reward_std": 0.1828709363937378, |
| "rewards/accuracy_reward": 0.3392857313156128, |
| "rewards/format_reward": 0.942857164144516, |
| "step": 1960 |
| }, |
| { |
| "completion_length": 191.27857818603516, |
| "epoch": 0.919272048530098, |
| "grad_norm": 3.2844011783599854, |
| "kl": 1.066259765625, |
| "learning_rate": 1.9735132509519302e-08, |
| "loss": 0.1838, |
| "reward": 1.271428632736206, |
| "reward_std": 0.33275041803717614, |
| "rewards/accuracy_reward": 0.35714287757873536, |
| "rewards/format_reward": 0.9142857491970062, |
| "step": 1970 |
| }, |
| { |
| "completion_length": 182.68929290771484, |
| "epoch": 0.9239384041063929, |
| "grad_norm": 5.930522441864014, |
| "kl": 0.534765625, |
| "learning_rate": 1.7532599969991347e-08, |
| "loss": 0.0479, |
| "reward": 1.2785714864730835, |
| "reward_std": 0.19234431087970733, |
| "rewards/accuracy_reward": 0.3214285895228386, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 1980 |
| }, |
| { |
| "completion_length": 178.8928649902344, |
| "epoch": 0.9286047596826879, |
| "grad_norm": 0.8274029493331909, |
| "kl": 0.40849609375, |
| "learning_rate": 1.545816593235416e-08, |
| "loss": 0.0388, |
| "reward": 1.3142857789993285, |
| "reward_std": 0.20442002266645432, |
| "rewards/accuracy_reward": 0.3500000134110451, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 1990 |
| }, |
| { |
| "completion_length": 188.56786499023437, |
| "epoch": 0.9332711152589828, |
| "grad_norm": 6.531651020050049, |
| "kl": 1.097607421875, |
| "learning_rate": 1.3512381173494458e-08, |
| "loss": 0.2064, |
| "reward": 1.3535714983940124, |
| "reward_std": 0.24394118189811706, |
| "rewards/accuracy_reward": 0.4250000178813934, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 2000 |
| }, |
| { |
| "completion_length": 194.05000762939454, |
| "epoch": 0.9379374708352777, |
| "grad_norm": 7.329962253570557, |
| "kl": 0.97109375, |
| "learning_rate": 1.169576231300684e-08, |
| "loss": 0.1501, |
| "reward": 1.2750000476837158, |
| "reward_std": 0.3136565685272217, |
| "rewards/accuracy_reward": 0.3607142955064774, |
| "rewards/format_reward": 0.9142857491970062, |
| "step": 2010 |
| }, |
| { |
| "completion_length": 198.34286499023438, |
| "epoch": 0.9426038264115726, |
| "grad_norm": 5.229209899902344, |
| "kl": 0.5191162109375, |
| "learning_rate": 1.000879167602764e-08, |
| "loss": 0.087, |
| "reward": 1.285714340209961, |
| "reward_std": 0.1509844921529293, |
| "rewards/accuracy_reward": 0.32857144698500634, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 2020 |
| }, |
| { |
| "completion_length": 202.35357971191405, |
| "epoch": 0.9472701819878675, |
| "grad_norm": 2.168286085128784, |
| "kl": 0.821923828125, |
| "learning_rate": 8.451917165174404e-09, |
| "loss": 0.1315, |
| "reward": 1.3642857670783997, |
| "reward_std": 0.23890879452228547, |
| "rewards/accuracy_reward": 0.4214285880327225, |
| "rewards/format_reward": 0.9428571581840515, |
| "step": 2030 |
| }, |
| { |
| "completion_length": 205.11786346435548, |
| "epoch": 0.9519365375641624, |
| "grad_norm": 11.269577980041504, |
| "kl": 1.1117919921875, |
| "learning_rate": 7.025552141624369e-09, |
| "loss": 0.2032, |
| "reward": 1.1714286088943482, |
| "reward_std": 0.2642820030450821, |
| "rewards/accuracy_reward": 0.2571428693830967, |
| "rewards/format_reward": 0.9142857372760773, |
| "step": 2040 |
| }, |
| { |
| "completion_length": 190.096435546875, |
| "epoch": 0.9566028931404573, |
| "grad_norm": 0.18682968616485596, |
| "kl": 0.72197265625, |
| "learning_rate": 5.730075315364346e-09, |
| "loss": 0.1334, |
| "reward": 1.2321429133415223, |
| "reward_std": 0.25800683721899986, |
| "rewards/accuracy_reward": 0.29642858505249026, |
| "rewards/format_reward": 0.935714316368103, |
| "step": 2050 |
| }, |
| { |
| "completion_length": 184.56429443359374, |
| "epoch": 0.9612692487167522, |
| "grad_norm": 2.113898754119873, |
| "kl": 0.876220703125, |
| "learning_rate": 4.565830644640223e-09, |
| "loss": 0.1067, |
| "reward": 1.271428632736206, |
| "reward_std": 0.3147573724389076, |
| "rewards/accuracy_reward": 0.3642857313156128, |
| "rewards/format_reward": 0.9071428894996643, |
| "step": 2060 |
| }, |
| { |
| "completion_length": 184.8857208251953, |
| "epoch": 0.9659356042930471, |
| "grad_norm": 6.786799907684326, |
| "kl": 0.4216796875, |
| "learning_rate": 3.533127244634171e-09, |
| "loss": 0.046, |
| "reward": 1.3571429133415223, |
| "reward_std": 0.17572808191180228, |
| "rewards/accuracy_reward": 0.38571430146694186, |
| "rewards/format_reward": 0.9714285790920257, |
| "step": 2070 |
| }, |
| { |
| "completion_length": 188.2321517944336, |
| "epoch": 0.9706019598693421, |
| "grad_norm": 6.5551042556762695, |
| "kl": 0.491552734375, |
| "learning_rate": 2.6322393053916925e-09, |
| "loss": 0.082, |
| "reward": 1.2392857670783997, |
| "reward_std": 0.24259887263178825, |
| "rewards/accuracy_reward": 0.29642858952283857, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 2080 |
| }, |
| { |
| "completion_length": 197.81429748535157, |
| "epoch": 0.975268315445637, |
| "grad_norm": 3.5728259086608887, |
| "kl": 0.3767578125, |
| "learning_rate": 1.86340601902274e-09, |
| "loss": 0.052, |
| "reward": 1.2035714983940125, |
| "reward_std": 0.18816160932183265, |
| "rewards/accuracy_reward": 0.26071429550647734, |
| "rewards/format_reward": 0.9428571701049805, |
| "step": 2090 |
| }, |
| { |
| "completion_length": 184.00000915527343, |
| "epoch": 0.9799346710219319, |
| "grad_norm": 8.470995903015137, |
| "kl": 0.66552734375, |
| "learning_rate": 1.2268315161944044e-09, |
| "loss": 0.0892, |
| "reward": 1.2678572058677673, |
| "reward_std": 0.16525296717882157, |
| "rewards/accuracy_reward": 0.2964285921305418, |
| "rewards/format_reward": 0.9714285850524902, |
| "step": 2100 |
| }, |
| { |
| "completion_length": 190.1964370727539, |
| "epoch": 0.9846010265982268, |
| "grad_norm": 5.933376312255859, |
| "kl": 0.58525390625, |
| "learning_rate": 7.226848119326057e-10, |
| "loss": 0.0743, |
| "reward": 1.2571429133415222, |
| "reward_std": 0.20183914229273797, |
| "rewards/accuracy_reward": 0.3285714477300644, |
| "rewards/format_reward": 0.9285714566707611, |
| "step": 2110 |
| }, |
| { |
| "completion_length": 203.46429290771485, |
| "epoch": 0.9892673821745217, |
| "grad_norm": 11.98684310913086, |
| "kl": 0.8004150390625, |
| "learning_rate": 3.510997607475974e-10, |
| "loss": 0.144, |
| "reward": 1.2071429133415221, |
| "reward_std": 0.29701889082789423, |
| "rewards/accuracy_reward": 0.28571429923176767, |
| "rewards/format_reward": 0.9214285969734192, |
| "step": 2120 |
| }, |
| { |
| "completion_length": 193.4964370727539, |
| "epoch": 0.9939337377508166, |
| "grad_norm": 1.8809298276901245, |
| "kl": 0.745263671875, |
| "learning_rate": 1.121750210946737e-10, |
| "loss": 0.1279, |
| "reward": 1.2571429133415222, |
| "reward_std": 0.26994478702545166, |
| "rewards/accuracy_reward": 0.3357143022119999, |
| "rewards/format_reward": 0.9214286029338836, |
| "step": 2130 |
| }, |
| { |
| "completion_length": 185.08929443359375, |
| "epoch": 0.9986000933271115, |
| "grad_norm": 1.6485497951507568, |
| "kl": 0.567529296875, |
| "learning_rate": 5.974029179456331e-12, |
| "loss": 0.097, |
| "reward": 1.310714328289032, |
| "reward_std": 0.1667502835392952, |
| "rewards/accuracy_reward": 0.3535714462399483, |
| "rewards/format_reward": 0.9571428775787354, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_completion_length": 203.0802721296038, |
| "eval_kl": 0.6515764508928571, |
| "eval_loss": 0.1416667252779007, |
| "eval_reward": 1.1919643453189306, |
| "eval_reward_std": 0.2649446129798889, |
| "eval_rewards/accuracy_reward": 0.24681123665400914, |
| "eval_rewards/format_reward": 0.9451530916350228, |
| "eval_runtime": 118.4791, |
| "eval_samples_per_second": 2.532, |
| "eval_steps_per_second": 0.025, |
| "step": 2143 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 2143, |
| "total_flos": 0.0, |
| "train_loss": 0.08651691437249102, |
| "train_runtime": 12893.4863, |
| "train_samples_per_second": 1.163, |
| "train_steps_per_second": 0.166 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2143, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|