Sphere-Expositor-700M-v1 / trainer_state.json
ishanb3d's picture
Upload folder using huggingface_hub
cd0acdb verified
{
"best_global_step": 600,
"best_metric": 0.473636656999588,
"best_model_checkpoint": "./liquidaps-clean-large/checkpoint-600",
"epoch": 1.367475035663338,
"eval_steps": 100,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.3900936841964722,
"epoch": 0.002282453637660485,
"grad_norm": 13.0,
"learning_rate": 0.0,
"loss": 0.8776,
"mean_token_accuracy": 0.7829889133572578,
"num_tokens": 5919.0,
"step": 1
},
{
"entropy": 1.4252997040748596,
"epoch": 0.00456490727532097,
"grad_norm": 12.875,
"learning_rate": 1.1363636363636364e-07,
"loss": 0.6708,
"mean_token_accuracy": 0.8342809975147247,
"num_tokens": 11950.0,
"step": 2
},
{
"entropy": 1.398602306842804,
"epoch": 0.0068473609129814554,
"grad_norm": 13.375,
"learning_rate": 2.2727272727272729e-07,
"loss": 0.817,
"mean_token_accuracy": 0.7826605513691902,
"num_tokens": 17559.0,
"step": 3
},
{
"entropy": 1.3683724850416183,
"epoch": 0.00912981455064194,
"grad_norm": 13.875,
"learning_rate": 3.409090909090909e-07,
"loss": 0.8089,
"mean_token_accuracy": 0.8110606968402863,
"num_tokens": 23355.0,
"step": 4
},
{
"entropy": 1.6440566033124924,
"epoch": 0.011412268188302425,
"grad_norm": 16.5,
"learning_rate": 4.5454545454545457e-07,
"loss": 1.0826,
"mean_token_accuracy": 0.7466800287365913,
"num_tokens": 28342.0,
"step": 5
},
{
"entropy": 1.2425581067800522,
"epoch": 0.013694721825962911,
"grad_norm": 14.0625,
"learning_rate": 5.681818181818182e-07,
"loss": 0.8384,
"mean_token_accuracy": 0.8118839636445045,
"num_tokens": 33937.0,
"step": 6
},
{
"entropy": 1.494078889489174,
"epoch": 0.015977175463623396,
"grad_norm": 14.9375,
"learning_rate": 6.818181818181818e-07,
"loss": 0.8747,
"mean_token_accuracy": 0.800664909183979,
"num_tokens": 39724.0,
"step": 7
},
{
"entropy": 1.3064402341842651,
"epoch": 0.01825962910128388,
"grad_norm": 12.0,
"learning_rate": 7.954545454545455e-07,
"loss": 0.8043,
"mean_token_accuracy": 0.8063121438026428,
"num_tokens": 46054.0,
"step": 8
},
{
"entropy": 1.507575884461403,
"epoch": 0.020542082738944364,
"grad_norm": 17.25,
"learning_rate": 9.090909090909091e-07,
"loss": 1.0366,
"mean_token_accuracy": 0.7458265796303749,
"num_tokens": 50806.0,
"step": 9
},
{
"entropy": 1.3228261321783066,
"epoch": 0.02282453637660485,
"grad_norm": 13.0,
"learning_rate": 1.0227272727272729e-06,
"loss": 0.6629,
"mean_token_accuracy": 0.8548868969082832,
"num_tokens": 56696.0,
"step": 10
},
{
"entropy": 1.3493094593286514,
"epoch": 0.025106990014265335,
"grad_norm": 10.9375,
"learning_rate": 1.1363636363636364e-06,
"loss": 0.7411,
"mean_token_accuracy": 0.8017316684126854,
"num_tokens": 63680.0,
"step": 11
},
{
"entropy": 1.3807552456855774,
"epoch": 0.027389443651925822,
"grad_norm": 12.9375,
"learning_rate": 1.25e-06,
"loss": 0.8135,
"mean_token_accuracy": 0.7994487285614014,
"num_tokens": 69861.0,
"step": 12
},
{
"entropy": 1.4055243730545044,
"epoch": 0.029671897289586305,
"grad_norm": 11.9375,
"learning_rate": 1.3636363636363636e-06,
"loss": 0.9012,
"mean_token_accuracy": 0.7958070710301399,
"num_tokens": 75989.0,
"step": 13
},
{
"entropy": 1.431694433093071,
"epoch": 0.03195435092724679,
"grad_norm": 13.75,
"learning_rate": 1.4772727272727275e-06,
"loss": 0.9413,
"mean_token_accuracy": 0.7656892687082291,
"num_tokens": 81844.0,
"step": 14
},
{
"entropy": 1.5010923892259598,
"epoch": 0.034236804564907276,
"grad_norm": 15.3125,
"learning_rate": 1.590909090909091e-06,
"loss": 1.0155,
"mean_token_accuracy": 0.7734663560986519,
"num_tokens": 86897.0,
"step": 15
},
{
"entropy": 1.4839733690023422,
"epoch": 0.03651925820256776,
"grad_norm": 12.9375,
"learning_rate": 1.7045454545454546e-06,
"loss": 0.8776,
"mean_token_accuracy": 0.7831285521388054,
"num_tokens": 92714.0,
"step": 16
},
{
"entropy": 1.3343003541231155,
"epoch": 0.038801711840228244,
"grad_norm": 9.375,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.7208,
"mean_token_accuracy": 0.8181507587432861,
"num_tokens": 100046.0,
"step": 17
},
{
"entropy": 1.488086387515068,
"epoch": 0.04108416547788873,
"grad_norm": 12.125,
"learning_rate": 1.931818181818182e-06,
"loss": 0.7636,
"mean_token_accuracy": 0.7991937696933746,
"num_tokens": 105549.0,
"step": 18
},
{
"entropy": 1.3153499066829681,
"epoch": 0.04336661911554922,
"grad_norm": 11.375,
"learning_rate": 2.0454545454545457e-06,
"loss": 0.7598,
"mean_token_accuracy": 0.8102546408772469,
"num_tokens": 111552.0,
"step": 19
},
{
"entropy": 1.3515659272670746,
"epoch": 0.0456490727532097,
"grad_norm": 11.375,
"learning_rate": 2.1590909090909092e-06,
"loss": 0.7113,
"mean_token_accuracy": 0.810497097671032,
"num_tokens": 117303.0,
"step": 20
},
{
"entropy": 1.4470301866531372,
"epoch": 0.047931526390870186,
"grad_norm": 11.125,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.8029,
"mean_token_accuracy": 0.7923144474625587,
"num_tokens": 123355.0,
"step": 21
},
{
"entropy": 1.3571707159280777,
"epoch": 0.05021398002853067,
"grad_norm": 9.4375,
"learning_rate": 2.3863636363636367e-06,
"loss": 0.6621,
"mean_token_accuracy": 0.8315573260188103,
"num_tokens": 129801.0,
"step": 22
},
{
"entropy": 1.4135605692863464,
"epoch": 0.05249643366619115,
"grad_norm": 10.875,
"learning_rate": 2.5e-06,
"loss": 0.7478,
"mean_token_accuracy": 0.8041789308190346,
"num_tokens": 135168.0,
"step": 23
},
{
"entropy": 1.4300416111946106,
"epoch": 0.054778887303851644,
"grad_norm": 10.0625,
"learning_rate": 2.6136363636363637e-06,
"loss": 0.7541,
"mean_token_accuracy": 0.8075885996222496,
"num_tokens": 141202.0,
"step": 24
},
{
"entropy": 1.3513601571321487,
"epoch": 0.05706134094151213,
"grad_norm": 9.25,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.6913,
"mean_token_accuracy": 0.8184778317809105,
"num_tokens": 147326.0,
"step": 25
},
{
"entropy": 1.3810700178146362,
"epoch": 0.05934379457917261,
"grad_norm": 9.75,
"learning_rate": 2.8409090909090916e-06,
"loss": 0.6849,
"mean_token_accuracy": 0.8293009474873543,
"num_tokens": 153439.0,
"step": 26
},
{
"entropy": 1.3730244934558868,
"epoch": 0.061626248216833095,
"grad_norm": 9.0625,
"learning_rate": 2.954545454545455e-06,
"loss": 0.6562,
"mean_token_accuracy": 0.8283357098698616,
"num_tokens": 159411.0,
"step": 27
},
{
"entropy": 1.337988331913948,
"epoch": 0.06390870185449359,
"grad_norm": 8.375,
"learning_rate": 3.0681818181818186e-06,
"loss": 0.5966,
"mean_token_accuracy": 0.837442196905613,
"num_tokens": 165669.0,
"step": 28
},
{
"entropy": 1.4772655963897705,
"epoch": 0.06619115549215407,
"grad_norm": 9.6875,
"learning_rate": 3.181818181818182e-06,
"loss": 0.7038,
"mean_token_accuracy": 0.8186220824718475,
"num_tokens": 170944.0,
"step": 29
},
{
"entropy": 1.3892450034618378,
"epoch": 0.06847360912981455,
"grad_norm": 7.8125,
"learning_rate": 3.2954545454545456e-06,
"loss": 0.658,
"mean_token_accuracy": 0.8269658461213112,
"num_tokens": 176755.0,
"step": 30
},
{
"entropy": 1.490507110953331,
"epoch": 0.07075606276747504,
"grad_norm": 8.375,
"learning_rate": 3.409090909090909e-06,
"loss": 0.7584,
"mean_token_accuracy": 0.7987356930971146,
"num_tokens": 182319.0,
"step": 31
},
{
"entropy": 1.3267859369516373,
"epoch": 0.07303851640513552,
"grad_norm": 7.15625,
"learning_rate": 3.522727272727273e-06,
"loss": 0.6272,
"mean_token_accuracy": 0.8291826993227005,
"num_tokens": 188236.0,
"step": 32
},
{
"entropy": 1.4844342470169067,
"epoch": 0.075320970042796,
"grad_norm": 7.53125,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.724,
"mean_token_accuracy": 0.806972049176693,
"num_tokens": 193965.0,
"step": 33
},
{
"entropy": 1.4742888659238815,
"epoch": 0.07760342368045649,
"grad_norm": 7.03125,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.6635,
"mean_token_accuracy": 0.8269493877887726,
"num_tokens": 199814.0,
"step": 34
},
{
"entropy": 1.3930696845054626,
"epoch": 0.07988587731811697,
"grad_norm": 6.5625,
"learning_rate": 3.863636363636364e-06,
"loss": 0.6553,
"mean_token_accuracy": 0.8298437520861626,
"num_tokens": 205725.0,
"step": 35
},
{
"entropy": 1.4377078860998154,
"epoch": 0.08216833095577745,
"grad_norm": 6.875,
"learning_rate": 3.9772727272727275e-06,
"loss": 0.6647,
"mean_token_accuracy": 0.8262319192290306,
"num_tokens": 211044.0,
"step": 36
},
{
"entropy": 1.4484449177980423,
"epoch": 0.08445078459343795,
"grad_norm": 5.8125,
"learning_rate": 4.0909090909090915e-06,
"loss": 0.6505,
"mean_token_accuracy": 0.8214789107441902,
"num_tokens": 217143.0,
"step": 37
},
{
"entropy": 1.3406399488449097,
"epoch": 0.08673323823109844,
"grad_norm": 5.5,
"learning_rate": 4.204545454545455e-06,
"loss": 0.5331,
"mean_token_accuracy": 0.8669695928692818,
"num_tokens": 224084.0,
"step": 38
},
{
"entropy": 1.465222254395485,
"epoch": 0.08901569186875892,
"grad_norm": 6.09375,
"learning_rate": 4.3181818181818185e-06,
"loss": 0.5913,
"mean_token_accuracy": 0.8346145749092102,
"num_tokens": 229446.0,
"step": 39
},
{
"entropy": 1.4082716703414917,
"epoch": 0.0912981455064194,
"grad_norm": 5.4375,
"learning_rate": 4.4318181818181824e-06,
"loss": 0.4967,
"mean_token_accuracy": 0.8573063313961029,
"num_tokens": 235250.0,
"step": 40
},
{
"entropy": 1.4065438956022263,
"epoch": 0.09358059914407989,
"grad_norm": 4.53125,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.5228,
"mean_token_accuracy": 0.8517210483551025,
"num_tokens": 241666.0,
"step": 41
},
{
"entropy": 1.4178601205348969,
"epoch": 0.09586305278174037,
"grad_norm": 4.875,
"learning_rate": 4.6590909090909095e-06,
"loss": 0.5534,
"mean_token_accuracy": 0.8581771478056908,
"num_tokens": 247901.0,
"step": 42
},
{
"entropy": 1.4665435552597046,
"epoch": 0.09814550641940085,
"grad_norm": 4.84375,
"learning_rate": 4.772727272727273e-06,
"loss": 0.524,
"mean_token_accuracy": 0.8218341246247292,
"num_tokens": 253273.0,
"step": 43
},
{
"entropy": 1.4858266711235046,
"epoch": 0.10042796005706134,
"grad_norm": 5.1875,
"learning_rate": 4.8863636363636365e-06,
"loss": 0.6752,
"mean_token_accuracy": 0.826298251748085,
"num_tokens": 258616.0,
"step": 44
},
{
"entropy": 1.3626787662506104,
"epoch": 0.10271041369472182,
"grad_norm": 4.5,
"learning_rate": 5e-06,
"loss": 0.5618,
"mean_token_accuracy": 0.8469245880842209,
"num_tokens": 264408.0,
"step": 45
},
{
"entropy": 1.3205972537398338,
"epoch": 0.1049928673323823,
"grad_norm": 3.796875,
"learning_rate": 4.99998226312344e-06,
"loss": 0.4616,
"mean_token_accuracy": 0.8739962726831436,
"num_tokens": 270566.0,
"step": 46
},
{
"entropy": 1.3779225647449493,
"epoch": 0.10727532097004279,
"grad_norm": 3.484375,
"learning_rate": 4.999929052745434e-06,
"loss": 0.4547,
"mean_token_accuracy": 0.8725937232375145,
"num_tokens": 276849.0,
"step": 47
},
{
"entropy": 1.5054886192083359,
"epoch": 0.10955777460770329,
"grad_norm": 4.71875,
"learning_rate": 4.999840369621011e-06,
"loss": 0.5994,
"mean_token_accuracy": 0.8370054960250854,
"num_tokens": 283205.0,
"step": 48
},
{
"entropy": 1.5157189071178436,
"epoch": 0.11184022824536377,
"grad_norm": 4.65625,
"learning_rate": 4.999716215008542e-06,
"loss": 0.5843,
"mean_token_accuracy": 0.8259787857532501,
"num_tokens": 288059.0,
"step": 49
},
{
"entropy": 1.38004170358181,
"epoch": 0.11412268188302425,
"grad_norm": 3.8125,
"learning_rate": 4.999556590669718e-06,
"loss": 0.405,
"mean_token_accuracy": 0.8887585029006004,
"num_tokens": 293798.0,
"step": 50
},
{
"entropy": 1.6085818111896515,
"epoch": 0.11640513552068474,
"grad_norm": 6.21875,
"learning_rate": 4.99936149886953e-06,
"loss": 0.5947,
"mean_token_accuracy": 0.8224818632006645,
"num_tokens": 298157.0,
"step": 51
},
{
"entropy": 1.4853103458881378,
"epoch": 0.11868758915834522,
"grad_norm": 3.453125,
"learning_rate": 4.999130942376232e-06,
"loss": 0.4428,
"mean_token_accuracy": 0.8794936537742615,
"num_tokens": 304309.0,
"step": 52
},
{
"entropy": 1.6272333711385727,
"epoch": 0.1209700427960057,
"grad_norm": 4.6875,
"learning_rate": 4.998864924461305e-06,
"loss": 0.5762,
"mean_token_accuracy": 0.8293572887778282,
"num_tokens": 309756.0,
"step": 53
},
{
"entropy": 1.289240226149559,
"epoch": 0.12325249643366619,
"grad_norm": 3.265625,
"learning_rate": 4.998563448899413e-06,
"loss": 0.4,
"mean_token_accuracy": 0.8821459114551544,
"num_tokens": 316395.0,
"step": 54
},
{
"entropy": 1.555517390370369,
"epoch": 0.12553495007132667,
"grad_norm": 4.5,
"learning_rate": 4.998226519968341e-06,
"loss": 0.5261,
"mean_token_accuracy": 0.8417777121067047,
"num_tokens": 321365.0,
"step": 55
},
{
"entropy": 1.4511889964342117,
"epoch": 0.12781740370898717,
"grad_norm": 3.828125,
"learning_rate": 4.997854142448944e-06,
"loss": 0.5362,
"mean_token_accuracy": 0.8543838635087013,
"num_tokens": 327850.0,
"step": 56
},
{
"entropy": 1.480227530002594,
"epoch": 0.13009985734664764,
"grad_norm": 4.90625,
"learning_rate": 4.9974463216250735e-06,
"loss": 0.6281,
"mean_token_accuracy": 0.8336407989263535,
"num_tokens": 332724.0,
"step": 57
},
{
"entropy": 1.4882567524909973,
"epoch": 0.13238231098430814,
"grad_norm": 4.03125,
"learning_rate": 4.997003063283503e-06,
"loss": 0.5103,
"mean_token_accuracy": 0.854725182056427,
"num_tokens": 338496.0,
"step": 58
},
{
"entropy": 1.3099189698696136,
"epoch": 0.1346647646219686,
"grad_norm": 3.546875,
"learning_rate": 4.996524373713848e-06,
"loss": 0.4035,
"mean_token_accuracy": 0.8902565762400627,
"num_tokens": 344181.0,
"step": 59
},
{
"entropy": 1.554222896695137,
"epoch": 0.1369472182596291,
"grad_norm": 4.28125,
"learning_rate": 4.996010259708475e-06,
"loss": 0.5154,
"mean_token_accuracy": 0.8221362680196762,
"num_tokens": 349987.0,
"step": 60
},
{
"entropy": 1.3615255653858185,
"epoch": 0.13922967189728958,
"grad_norm": 4.125,
"learning_rate": 4.995460728562403e-06,
"loss": 0.5219,
"mean_token_accuracy": 0.8591368719935417,
"num_tokens": 355808.0,
"step": 61
},
{
"entropy": 1.5018275529146194,
"epoch": 0.14151212553495007,
"grad_norm": 3.8125,
"learning_rate": 4.994875788073207e-06,
"loss": 0.4981,
"mean_token_accuracy": 0.8580456078052521,
"num_tokens": 361358.0,
"step": 62
},
{
"entropy": 1.3897339552640915,
"epoch": 0.14379457917261054,
"grad_norm": 3.984375,
"learning_rate": 4.9942554465409e-06,
"loss": 0.4961,
"mean_token_accuracy": 0.8571888878941536,
"num_tokens": 366798.0,
"step": 63
},
{
"entropy": 1.3545932322740555,
"epoch": 0.14607703281027104,
"grad_norm": 3.34375,
"learning_rate": 4.99359971276782e-06,
"loss": 0.4023,
"mean_token_accuracy": 0.8760626539587975,
"num_tokens": 373039.0,
"step": 64
},
{
"entropy": 1.4018055945634842,
"epoch": 0.14835948644793154,
"grad_norm": 3.484375,
"learning_rate": 4.992908596058501e-06,
"loss": 0.4874,
"mean_token_accuracy": 0.8551009446382523,
"num_tokens": 379151.0,
"step": 65
},
{
"entropy": 1.408715844154358,
"epoch": 0.150641940085592,
"grad_norm": 3.78125,
"learning_rate": 4.9921821062195445e-06,
"loss": 0.5979,
"mean_token_accuracy": 0.8376783430576324,
"num_tokens": 385466.0,
"step": 66
},
{
"entropy": 1.478136882185936,
"epoch": 0.1529243937232525,
"grad_norm": 3.34375,
"learning_rate": 4.9914202535594795e-06,
"loss": 0.4359,
"mean_token_accuracy": 0.8765653073787689,
"num_tokens": 391861.0,
"step": 67
},
{
"entropy": 1.3361108154058456,
"epoch": 0.15520684736091298,
"grad_norm": 3.453125,
"learning_rate": 4.990623048888615e-06,
"loss": 0.4471,
"mean_token_accuracy": 0.8761897683143616,
"num_tokens": 397602.0,
"step": 68
},
{
"entropy": 1.5057465434074402,
"epoch": 0.15748930099857347,
"grad_norm": 3.984375,
"learning_rate": 4.989790503518888e-06,
"loss": 0.5262,
"mean_token_accuracy": 0.8583421856164932,
"num_tokens": 403847.0,
"step": 69
},
{
"entropy": 1.5415615290403366,
"epoch": 0.15977175463623394,
"grad_norm": 4.03125,
"learning_rate": 4.988922629263701e-06,
"loss": 0.598,
"mean_token_accuracy": 0.8401808813214302,
"num_tokens": 409563.0,
"step": 70
},
{
"entropy": 1.433893471956253,
"epoch": 0.16205420827389444,
"grad_norm": 3.875,
"learning_rate": 4.988019438437759e-06,
"loss": 0.5086,
"mean_token_accuracy": 0.8572655767202377,
"num_tokens": 415590.0,
"step": 71
},
{
"entropy": 1.5654226392507553,
"epoch": 0.1643366619115549,
"grad_norm": 4.65625,
"learning_rate": 4.987080943856887e-06,
"loss": 0.6098,
"mean_token_accuracy": 0.8376531600952148,
"num_tokens": 421266.0,
"step": 72
},
{
"entropy": 1.513851910829544,
"epoch": 0.1666191155492154,
"grad_norm": 3.78125,
"learning_rate": 4.9861071588378565e-06,
"loss": 0.4454,
"mean_token_accuracy": 0.8665637820959091,
"num_tokens": 426394.0,
"step": 73
},
{
"entropy": 1.5542047619819641,
"epoch": 0.1689015691868759,
"grad_norm": 4.71875,
"learning_rate": 4.9850980971981914e-06,
"loss": 0.6814,
"mean_token_accuracy": 0.808769017457962,
"num_tokens": 431932.0,
"step": 74
},
{
"entropy": 1.4060749858617783,
"epoch": 0.17118402282453637,
"grad_norm": 3.53125,
"learning_rate": 4.984053773255971e-06,
"loss": 0.4207,
"mean_token_accuracy": 0.8581205531954765,
"num_tokens": 437984.0,
"step": 75
},
{
"entropy": 1.4776476472616196,
"epoch": 0.17346647646219687,
"grad_norm": 3.9375,
"learning_rate": 4.9829742018296335e-06,
"loss": 0.5346,
"mean_token_accuracy": 0.8503594622015953,
"num_tokens": 444584.0,
"step": 76
},
{
"entropy": 1.3919195085763931,
"epoch": 0.17574893009985734,
"grad_norm": 3.21875,
"learning_rate": 4.981859398237758e-06,
"loss": 0.4565,
"mean_token_accuracy": 0.8721787855029106,
"num_tokens": 450943.0,
"step": 77
},
{
"entropy": 1.4689613282680511,
"epoch": 0.17803138373751784,
"grad_norm": 3.9375,
"learning_rate": 4.980709378298851e-06,
"loss": 0.5434,
"mean_token_accuracy": 0.8531812652945518,
"num_tokens": 456471.0,
"step": 78
},
{
"entropy": 1.474008470773697,
"epoch": 0.1803138373751783,
"grad_norm": 4.09375,
"learning_rate": 4.979524158331123e-06,
"loss": 0.531,
"mean_token_accuracy": 0.8535453379154205,
"num_tokens": 462328.0,
"step": 79
},
{
"entropy": 1.3587582856416702,
"epoch": 0.1825962910128388,
"grad_norm": 4.03125,
"learning_rate": 4.978303755152254e-06,
"loss": 0.4992,
"mean_token_accuracy": 0.8549595400691032,
"num_tokens": 468402.0,
"step": 80
},
{
"entropy": 1.3619231432676315,
"epoch": 0.18487874465049928,
"grad_norm": 3.359375,
"learning_rate": 4.977048186079155e-06,
"loss": 0.4981,
"mean_token_accuracy": 0.8575711026787758,
"num_tokens": 473714.0,
"step": 81
},
{
"entropy": 1.4384445995092392,
"epoch": 0.18716119828815977,
"grad_norm": 3.328125,
"learning_rate": 4.975757468927727e-06,
"loss": 0.4181,
"mean_token_accuracy": 0.8731885701417923,
"num_tokens": 479842.0,
"step": 82
},
{
"entropy": 1.5311954617500305,
"epoch": 0.18944365192582024,
"grad_norm": 4.34375,
"learning_rate": 4.974431622012601e-06,
"loss": 0.6287,
"mean_token_accuracy": 0.821938157081604,
"num_tokens": 485680.0,
"step": 83
},
{
"entropy": 1.358711913228035,
"epoch": 0.19172610556348074,
"grad_norm": 3.65625,
"learning_rate": 4.973070664146885e-06,
"loss": 0.4416,
"mean_token_accuracy": 0.873858779668808,
"num_tokens": 491390.0,
"step": 84
},
{
"entropy": 1.4033315032720566,
"epoch": 0.19400855920114124,
"grad_norm": 3.890625,
"learning_rate": 4.971674614641891e-06,
"loss": 0.4835,
"mean_token_accuracy": 0.861111544072628,
"num_tokens": 497469.0,
"step": 85
},
{
"entropy": 1.373718798160553,
"epoch": 0.1962910128388017,
"grad_norm": 3.46875,
"learning_rate": 4.970243493306865e-06,
"loss": 0.4599,
"mean_token_accuracy": 0.8647707998752594,
"num_tokens": 503754.0,
"step": 86
},
{
"entropy": 1.4159798175096512,
"epoch": 0.1985734664764622,
"grad_norm": 3.71875,
"learning_rate": 4.968777320448707e-06,
"loss": 0.41,
"mean_token_accuracy": 0.8731393367052078,
"num_tokens": 509255.0,
"step": 87
},
{
"entropy": 1.397733435034752,
"epoch": 0.20085592011412268,
"grad_norm": 4.0,
"learning_rate": 4.9672761168716766e-06,
"loss": 0.4607,
"mean_token_accuracy": 0.8771609216928482,
"num_tokens": 515162.0,
"step": 88
},
{
"entropy": 1.3901693522930145,
"epoch": 0.20313837375178317,
"grad_norm": 3.703125,
"learning_rate": 4.9657399038771045e-06,
"loss": 0.4985,
"mean_token_accuracy": 0.8564205095171928,
"num_tokens": 520980.0,
"step": 89
},
{
"entropy": 1.470759555697441,
"epoch": 0.20542082738944364,
"grad_norm": 4.09375,
"learning_rate": 4.964168703263086e-06,
"loss": 0.5552,
"mean_token_accuracy": 0.834749348461628,
"num_tokens": 526901.0,
"step": 90
},
{
"entropy": 1.5493524819612503,
"epoch": 0.20770328102710414,
"grad_norm": 4.09375,
"learning_rate": 4.962562537324176e-06,
"loss": 0.5276,
"mean_token_accuracy": 0.8242713585495949,
"num_tokens": 532502.0,
"step": 91
},
{
"entropy": 1.4955510944128036,
"epoch": 0.2099857346647646,
"grad_norm": 4.5,
"learning_rate": 4.960921428851066e-06,
"loss": 0.6117,
"mean_token_accuracy": 0.8246004208922386,
"num_tokens": 538159.0,
"step": 92
},
{
"entropy": 1.4567335098981857,
"epoch": 0.2122681883024251,
"grad_norm": 3.0,
"learning_rate": 4.959245401130269e-06,
"loss": 0.3503,
"mean_token_accuracy": 0.8856313973665237,
"num_tokens": 544079.0,
"step": 93
},
{
"entropy": 1.458535224199295,
"epoch": 0.21455064194008558,
"grad_norm": 3.625,
"learning_rate": 4.957534477943782e-06,
"loss": 0.4434,
"mean_token_accuracy": 0.858425110578537,
"num_tokens": 550037.0,
"step": 94
},
{
"entropy": 1.3983053117990494,
"epoch": 0.21683309557774608,
"grad_norm": 3.375,
"learning_rate": 4.955788683568749e-06,
"loss": 0.4004,
"mean_token_accuracy": 0.8748428821563721,
"num_tokens": 556585.0,
"step": 95
},
{
"entropy": 1.481145054101944,
"epoch": 0.21911554921540657,
"grad_norm": 3.3125,
"learning_rate": 4.954008042777125e-06,
"loss": 0.409,
"mean_token_accuracy": 0.8758149892091751,
"num_tokens": 562355.0,
"step": 96
},
{
"entropy": 1.6243803054094315,
"epoch": 0.22139800285306704,
"grad_norm": 4.75,
"learning_rate": 4.952192580835313e-06,
"loss": 0.6636,
"mean_token_accuracy": 0.7973536550998688,
"num_tokens": 568202.0,
"step": 97
},
{
"entropy": 1.575976401567459,
"epoch": 0.22368045649072754,
"grad_norm": 4.59375,
"learning_rate": 4.950342323503812e-06,
"loss": 0.6046,
"mean_token_accuracy": 0.813086025416851,
"num_tokens": 573655.0,
"step": 98
},
{
"entropy": 1.5205018073320389,
"epoch": 0.225962910128388,
"grad_norm": 3.953125,
"learning_rate": 4.9484572970368516e-06,
"loss": 0.5502,
"mean_token_accuracy": 0.8478811085224152,
"num_tokens": 579742.0,
"step": 99
},
{
"entropy": 1.5319028943777084,
"epoch": 0.2282453637660485,
"grad_norm": 4.71875,
"learning_rate": 4.946537528182017e-06,
"loss": 0.6014,
"mean_token_accuracy": 0.8344146087765694,
"num_tokens": 584824.0,
"step": 100
},
{
"epoch": 0.2282453637660485,
"eval_entropy": 1.4501528475019667,
"eval_loss": 0.5052191615104675,
"eval_mean_token_accuracy": 0.8605326036612193,
"eval_num_tokens": 584824.0,
"eval_runtime": 4.4666,
"eval_samples_per_second": 20.149,
"eval_steps_per_second": 20.149,
"step": 100
},
{
"entropy": 1.3917143046855927,
"epoch": 0.23052781740370898,
"grad_norm": 3.203125,
"learning_rate": 4.944583044179871e-06,
"loss": 0.3933,
"mean_token_accuracy": 0.8733155429363251,
"num_tokens": 590608.0,
"step": 101
},
{
"entropy": 1.3328562825918198,
"epoch": 0.23281027104136948,
"grad_norm": 3.0625,
"learning_rate": 4.942593872763566e-06,
"loss": 0.3922,
"mean_token_accuracy": 0.8770610764622688,
"num_tokens": 596918.0,
"step": 102
},
{
"entropy": 1.3897913247346878,
"epoch": 0.23509272467902995,
"grad_norm": 3.4375,
"learning_rate": 4.940570042158454e-06,
"loss": 0.4864,
"mean_token_accuracy": 0.8629380613565445,
"num_tokens": 602674.0,
"step": 103
},
{
"entropy": 1.5906241983175278,
"epoch": 0.23737517831669044,
"grad_norm": 4.46875,
"learning_rate": 4.93851158108168e-06,
"loss": 0.6066,
"mean_token_accuracy": 0.8188068121671677,
"num_tokens": 608041.0,
"step": 104
},
{
"entropy": 1.421783059835434,
"epoch": 0.2396576319543509,
"grad_norm": 3.453125,
"learning_rate": 4.93641851874178e-06,
"loss": 0.4813,
"mean_token_accuracy": 0.8542051687836647,
"num_tokens": 613908.0,
"step": 105
},
{
"entropy": 1.4839935898780823,
"epoch": 0.2419400855920114,
"grad_norm": 4.34375,
"learning_rate": 4.934290884838266e-06,
"loss": 0.539,
"mean_token_accuracy": 0.8587613850831985,
"num_tokens": 620475.0,
"step": 106
},
{
"entropy": 1.4981091767549515,
"epoch": 0.2442225392296719,
"grad_norm": 3.40625,
"learning_rate": 4.932128709561202e-06,
"loss": 0.4702,
"mean_token_accuracy": 0.866189256310463,
"num_tokens": 626833.0,
"step": 107
},
{
"entropy": 1.47100168466568,
"epoch": 0.24650499286733238,
"grad_norm": 3.453125,
"learning_rate": 4.929932023590776e-06,
"loss": 0.4146,
"mean_token_accuracy": 0.8706357181072235,
"num_tokens": 632605.0,
"step": 108
},
{
"entropy": 1.4089600145816803,
"epoch": 0.24878744650499288,
"grad_norm": 2.921875,
"learning_rate": 4.9277008580968665e-06,
"loss": 0.4052,
"mean_token_accuracy": 0.8793638423085213,
"num_tokens": 639026.0,
"step": 109
},
{
"entropy": 1.4623335748910904,
"epoch": 0.25106990014265335,
"grad_norm": 3.109375,
"learning_rate": 4.925435244738599e-06,
"loss": 0.4251,
"mean_token_accuracy": 0.8607661128044128,
"num_tokens": 645661.0,
"step": 110
},
{
"entropy": 1.469603717327118,
"epoch": 0.25335235378031384,
"grad_norm": 3.203125,
"learning_rate": 4.923135215663897e-06,
"loss": 0.4562,
"mean_token_accuracy": 0.8637586832046509,
"num_tokens": 652088.0,
"step": 111
},
{
"entropy": 1.4699177891016006,
"epoch": 0.25563480741797434,
"grad_norm": 3.78125,
"learning_rate": 4.920800803509026e-06,
"loss": 0.4358,
"mean_token_accuracy": 0.8661052659153938,
"num_tokens": 657148.0,
"step": 112
},
{
"entropy": 1.4687887877225876,
"epoch": 0.2579172610556348,
"grad_norm": 4.15625,
"learning_rate": 4.91843204139813e-06,
"loss": 0.4832,
"mean_token_accuracy": 0.87067711353302,
"num_tokens": 662846.0,
"step": 113
},
{
"entropy": 1.3910206109285355,
"epoch": 0.2601997146932953,
"grad_norm": 3.96875,
"learning_rate": 4.916028962942763e-06,
"loss": 0.4606,
"mean_token_accuracy": 0.8688057661056519,
"num_tokens": 668283.0,
"step": 114
},
{
"entropy": 1.4946473091840744,
"epoch": 0.2624821683309558,
"grad_norm": 4.4375,
"learning_rate": 4.913591602241409e-06,
"loss": 0.5177,
"mean_token_accuracy": 0.8503523468971252,
"num_tokens": 673962.0,
"step": 115
},
{
"entropy": 1.4268899112939835,
"epoch": 0.2647646219686163,
"grad_norm": 3.734375,
"learning_rate": 4.911119993878999e-06,
"loss": 0.4608,
"mean_token_accuracy": 0.8624838441610336,
"num_tokens": 679433.0,
"step": 116
},
{
"entropy": 1.4775933474302292,
"epoch": 0.2670470756062768,
"grad_norm": 3.359375,
"learning_rate": 4.908614172926426e-06,
"loss": 0.373,
"mean_token_accuracy": 0.8674890100955963,
"num_tokens": 685178.0,
"step": 117
},
{
"entropy": 1.4562716633081436,
"epoch": 0.2693295292439372,
"grad_norm": 3.890625,
"learning_rate": 4.906074174940038e-06,
"loss": 0.5465,
"mean_token_accuracy": 0.8421404510736465,
"num_tokens": 691044.0,
"step": 118
},
{
"entropy": 1.404031679034233,
"epoch": 0.2716119828815977,
"grad_norm": 3.625,
"learning_rate": 4.903500035961139e-06,
"loss": 0.4888,
"mean_token_accuracy": 0.8540224209427834,
"num_tokens": 697301.0,
"step": 119
},
{
"entropy": 1.421856850385666,
"epoch": 0.2738944365192582,
"grad_norm": 3.328125,
"learning_rate": 4.9008917925154795e-06,
"loss": 0.438,
"mean_token_accuracy": 0.8775565698742867,
"num_tokens": 704275.0,
"step": 120
},
{
"entropy": 1.5078845471143723,
"epoch": 0.2761768901569187,
"grad_norm": 3.640625,
"learning_rate": 4.89824948161273e-06,
"loss": 0.4837,
"mean_token_accuracy": 0.8578910827636719,
"num_tokens": 710429.0,
"step": 121
},
{
"entropy": 1.4396383464336395,
"epoch": 0.27845934379457915,
"grad_norm": 3.71875,
"learning_rate": 4.895573140745967e-06,
"loss": 0.5219,
"mean_token_accuracy": 0.8433092087507248,
"num_tokens": 715838.0,
"step": 122
},
{
"entropy": 1.4553385972976685,
"epoch": 0.28074179743223965,
"grad_norm": 3.578125,
"learning_rate": 4.892862807891131e-06,
"loss": 0.4401,
"mean_token_accuracy": 0.869629830121994,
"num_tokens": 721249.0,
"step": 123
},
{
"entropy": 1.4222912788391113,
"epoch": 0.28302425106990015,
"grad_norm": 3.921875,
"learning_rate": 4.890118521506494e-06,
"loss": 0.5689,
"mean_token_accuracy": 0.8471446335315704,
"num_tokens": 727806.0,
"step": 124
},
{
"entropy": 1.4638441801071167,
"epoch": 0.28530670470756064,
"grad_norm": 3.59375,
"learning_rate": 4.8873403205321115e-06,
"loss": 0.4898,
"mean_token_accuracy": 0.8609614819288254,
"num_tokens": 733588.0,
"step": 125
},
{
"entropy": 1.360969141125679,
"epoch": 0.2875891583452211,
"grad_norm": 4.9375,
"learning_rate": 4.884528244389269e-06,
"loss": 0.5004,
"mean_token_accuracy": 0.8577578216791153,
"num_tokens": 739069.0,
"step": 126
},
{
"entropy": 1.4701900631189346,
"epoch": 0.2898716119828816,
"grad_norm": 3.890625,
"learning_rate": 4.881682332979925e-06,
"loss": 0.4782,
"mean_token_accuracy": 0.8597236052155495,
"num_tokens": 744612.0,
"step": 127
},
{
"entropy": 1.484321504831314,
"epoch": 0.2921540656205421,
"grad_norm": 4.34375,
"learning_rate": 4.878802626686141e-06,
"loss": 0.5044,
"mean_token_accuracy": 0.8599332422018051,
"num_tokens": 750198.0,
"step": 128
},
{
"entropy": 1.4526187181472778,
"epoch": 0.2944365192582026,
"grad_norm": 4.25,
"learning_rate": 4.8758891663695165e-06,
"loss": 0.5283,
"mean_token_accuracy": 0.8519927933812141,
"num_tokens": 755825.0,
"step": 129
},
{
"entropy": 1.487746685743332,
"epoch": 0.2967189728958631,
"grad_norm": 3.859375,
"learning_rate": 4.872941993370598e-06,
"loss": 0.4834,
"mean_token_accuracy": 0.865722268819809,
"num_tokens": 762609.0,
"step": 130
},
{
"entropy": 1.4334597885608673,
"epoch": 0.2990014265335235,
"grad_norm": 3.609375,
"learning_rate": 4.869961149508301e-06,
"loss": 0.462,
"mean_token_accuracy": 0.8797513917088509,
"num_tokens": 768825.0,
"step": 131
},
{
"entropy": 1.5593868792057037,
"epoch": 0.301283880171184,
"grad_norm": 3.75,
"learning_rate": 4.866946677079314e-06,
"loss": 0.4398,
"mean_token_accuracy": 0.8622937723994255,
"num_tokens": 774231.0,
"step": 132
},
{
"entropy": 1.582775130867958,
"epoch": 0.3035663338088445,
"grad_norm": 4.1875,
"learning_rate": 4.8638986188574955e-06,
"loss": 0.5733,
"mean_token_accuracy": 0.8216232135891914,
"num_tokens": 779217.0,
"step": 133
},
{
"entropy": 1.4957093298435211,
"epoch": 0.305848787446505,
"grad_norm": 3.875,
"learning_rate": 4.8608170180932725e-06,
"loss": 0.4983,
"mean_token_accuracy": 0.8560524433851242,
"num_tokens": 785209.0,
"step": 134
},
{
"entropy": 1.4334331154823303,
"epoch": 0.30813124108416545,
"grad_norm": 3.375,
"learning_rate": 4.857701918513023e-06,
"loss": 0.4457,
"mean_token_accuracy": 0.8704549074172974,
"num_tokens": 791251.0,
"step": 135
},
{
"entropy": 1.4960424304008484,
"epoch": 0.31041369472182595,
"grad_norm": 3.546875,
"learning_rate": 4.854553364318456e-06,
"loss": 0.4823,
"mean_token_accuracy": 0.869213730096817,
"num_tokens": 797202.0,
"step": 136
},
{
"entropy": 1.3933140188455582,
"epoch": 0.31269614835948645,
"grad_norm": 3.0,
"learning_rate": 4.851371400185986e-06,
"loss": 0.4387,
"mean_token_accuracy": 0.8605329319834709,
"num_tokens": 804144.0,
"step": 137
},
{
"entropy": 1.4915095120668411,
"epoch": 0.31497860199714695,
"grad_norm": 4.03125,
"learning_rate": 4.848156071266095e-06,
"loss": 0.404,
"mean_token_accuracy": 0.8624937981367111,
"num_tokens": 809125.0,
"step": 138
},
{
"entropy": 1.422121912240982,
"epoch": 0.31726105563480744,
"grad_norm": 3.5,
"learning_rate": 4.844907423182699e-06,
"loss": 0.3698,
"mean_token_accuracy": 0.8753552809357643,
"num_tokens": 814420.0,
"step": 139
},
{
"entropy": 1.3587403669953346,
"epoch": 0.3195435092724679,
"grad_norm": 3.65625,
"learning_rate": 4.841625502032495e-06,
"loss": 0.4201,
"mean_token_accuracy": 0.8749541118741035,
"num_tokens": 819445.0,
"step": 140
},
{
"entropy": 1.5249932259321213,
"epoch": 0.3218259629101284,
"grad_norm": 3.765625,
"learning_rate": 4.838310354384304e-06,
"loss": 0.4569,
"mean_token_accuracy": 0.8636204749345779,
"num_tokens": 825423.0,
"step": 141
},
{
"entropy": 1.4787572473287582,
"epoch": 0.3241084165477889,
"grad_norm": 4.28125,
"learning_rate": 4.834962027278418e-06,
"loss": 0.4271,
"mean_token_accuracy": 0.8966826573014259,
"num_tokens": 830608.0,
"step": 142
},
{
"entropy": 1.3900313079357147,
"epoch": 0.3263908701854494,
"grad_norm": 3.3125,
"learning_rate": 4.831580568225931e-06,
"loss": 0.4272,
"mean_token_accuracy": 0.8754951432347298,
"num_tokens": 837069.0,
"step": 143
},
{
"entropy": 1.4659005105495453,
"epoch": 0.3286733238231098,
"grad_norm": 3.71875,
"learning_rate": 4.828166025208059e-06,
"loss": 0.4788,
"mean_token_accuracy": 0.8542606756091118,
"num_tokens": 842779.0,
"step": 144
},
{
"entropy": 1.4241313189268112,
"epoch": 0.3309557774607703,
"grad_norm": 3.5,
"learning_rate": 4.824718446675465e-06,
"loss": 0.4501,
"mean_token_accuracy": 0.8673816919326782,
"num_tokens": 848075.0,
"step": 145
},
{
"entropy": 1.3615167737007141,
"epoch": 0.3332382310984308,
"grad_norm": 3.984375,
"learning_rate": 4.821237881547567e-06,
"loss": 0.4803,
"mean_token_accuracy": 0.8680660426616669,
"num_tokens": 853972.0,
"step": 146
},
{
"entropy": 1.4747860878705978,
"epoch": 0.3355206847360913,
"grad_norm": 3.890625,
"learning_rate": 4.8177243792118515e-06,
"loss": 0.4336,
"mean_token_accuracy": 0.8747361823916435,
"num_tokens": 859859.0,
"step": 147
},
{
"entropy": 1.5414969474077225,
"epoch": 0.3378031383737518,
"grad_norm": 3.484375,
"learning_rate": 4.814177989523162e-06,
"loss": 0.4489,
"mean_token_accuracy": 0.8644633367657661,
"num_tokens": 865836.0,
"step": 148
},
{
"entropy": 1.6249495893716812,
"epoch": 0.34008559201141225,
"grad_norm": 3.53125,
"learning_rate": 4.810598762803e-06,
"loss": 0.5226,
"mean_token_accuracy": 0.8477596640586853,
"num_tokens": 872086.0,
"step": 149
},
{
"entropy": 1.4743667244911194,
"epoch": 0.34236804564907275,
"grad_norm": 3.75,
"learning_rate": 4.8069867498388066e-06,
"loss": 0.4693,
"mean_token_accuracy": 0.8513918668031693,
"num_tokens": 877138.0,
"step": 150
},
{
"entropy": 1.3822671622037888,
"epoch": 0.34465049928673325,
"grad_norm": 3.21875,
"learning_rate": 4.803342001883247e-06,
"loss": 0.408,
"mean_token_accuracy": 0.8763712868094444,
"num_tokens": 883268.0,
"step": 151
},
{
"entropy": 1.4955266863107681,
"epoch": 0.34693295292439374,
"grad_norm": 4.15625,
"learning_rate": 4.799664570653473e-06,
"loss": 0.5271,
"mean_token_accuracy": 0.8504318669438362,
"num_tokens": 889206.0,
"step": 152
},
{
"entropy": 1.6125495880842209,
"epoch": 0.3492154065620542,
"grad_norm": 5.71875,
"learning_rate": 4.795954508330403e-06,
"loss": 0.6248,
"mean_token_accuracy": 0.8179907724261284,
"num_tokens": 894476.0,
"step": 153
},
{
"entropy": 1.5931424498558044,
"epoch": 0.3514978601997147,
"grad_norm": 4.75,
"learning_rate": 4.792211867557969e-06,
"loss": 0.4888,
"mean_token_accuracy": 0.8579384312033653,
"num_tokens": 899026.0,
"step": 154
},
{
"entropy": 1.4209279268980026,
"epoch": 0.3537803138373752,
"grad_norm": 3.484375,
"learning_rate": 4.788436701442378e-06,
"loss": 0.4354,
"mean_token_accuracy": 0.8708065152168274,
"num_tokens": 905347.0,
"step": 155
},
{
"entropy": 1.4381519109010696,
"epoch": 0.3560627674750357,
"grad_norm": 3.703125,
"learning_rate": 4.784629063551354e-06,
"loss": 0.5609,
"mean_token_accuracy": 0.8458188697695732,
"num_tokens": 911400.0,
"step": 156
},
{
"entropy": 1.4265454858541489,
"epoch": 0.3583452211126961,
"grad_norm": 3.5,
"learning_rate": 4.780789007913379e-06,
"loss": 0.516,
"mean_token_accuracy": 0.8464484214782715,
"num_tokens": 917633.0,
"step": 157
},
{
"entropy": 1.6952187418937683,
"epoch": 0.3606276747503566,
"grad_norm": 4.46875,
"learning_rate": 4.776916589016928e-06,
"loss": 0.6655,
"mean_token_accuracy": 0.8154120817780495,
"num_tokens": 922878.0,
"step": 158
},
{
"entropy": 1.4849806427955627,
"epoch": 0.3629101283880171,
"grad_norm": 3.984375,
"learning_rate": 4.773011861809694e-06,
"loss": 0.5529,
"mean_token_accuracy": 0.8317237794399261,
"num_tokens": 928432.0,
"step": 159
},
{
"entropy": 1.3825362920761108,
"epoch": 0.3651925820256776,
"grad_norm": 3.1875,
"learning_rate": 4.769074881697806e-06,
"loss": 0.422,
"mean_token_accuracy": 0.8742568120360374,
"num_tokens": 934019.0,
"step": 160
},
{
"entropy": 1.429061233997345,
"epoch": 0.3674750356633381,
"grad_norm": 3.328125,
"learning_rate": 4.765105704545052e-06,
"loss": 0.4181,
"mean_token_accuracy": 0.8700381815433502,
"num_tokens": 940405.0,
"step": 161
},
{
"entropy": 1.4522172808647156,
"epoch": 0.36975748930099855,
"grad_norm": 3.296875,
"learning_rate": 4.761104386672074e-06,
"loss": 0.4664,
"mean_token_accuracy": 0.8705998063087463,
"num_tokens": 946891.0,
"step": 162
},
{
"entropy": 1.4823334366083145,
"epoch": 0.37203994293865905,
"grad_norm": 3.171875,
"learning_rate": 4.757070984855577e-06,
"loss": 0.3902,
"mean_token_accuracy": 0.8853188008069992,
"num_tokens": 954063.0,
"step": 163
},
{
"entropy": 1.4951584190130234,
"epoch": 0.37432239657631955,
"grad_norm": 4.25,
"learning_rate": 4.7530055563275225e-06,
"loss": 0.4601,
"mean_token_accuracy": 0.8618411421775818,
"num_tokens": 959914.0,
"step": 164
},
{
"entropy": 1.4573408663272858,
"epoch": 0.37660485021398005,
"grad_norm": 4.625,
"learning_rate": 4.748908158774312e-06,
"loss": 0.5381,
"mean_token_accuracy": 0.8516411259770393,
"num_tokens": 965145.0,
"step": 165
},
{
"entropy": 1.4346065074205399,
"epoch": 0.3788873038516405,
"grad_norm": 3.421875,
"learning_rate": 4.744778850335974e-06,
"loss": 0.4718,
"mean_token_accuracy": 0.8635387867689133,
"num_tokens": 971469.0,
"step": 166
},
{
"entropy": 1.4204413443803787,
"epoch": 0.381169757489301,
"grad_norm": 3.6875,
"learning_rate": 4.7406176896053356e-06,
"loss": 0.4281,
"mean_token_accuracy": 0.8760756626725197,
"num_tokens": 976905.0,
"step": 167
},
{
"entropy": 1.4582399874925613,
"epoch": 0.3834522111269615,
"grad_norm": 3.625,
"learning_rate": 4.736424735627193e-06,
"loss": 0.472,
"mean_token_accuracy": 0.8653873577713966,
"num_tokens": 982797.0,
"step": 168
},
{
"entropy": 1.4145529568195343,
"epoch": 0.385734664764622,
"grad_norm": 4.15625,
"learning_rate": 4.73220004789747e-06,
"loss": 0.4677,
"mean_token_accuracy": 0.8689080029726028,
"num_tokens": 988588.0,
"step": 169
},
{
"entropy": 1.4675364196300507,
"epoch": 0.3880171184022825,
"grad_norm": 3.796875,
"learning_rate": 4.7279436863623805e-06,
"loss": 0.4218,
"mean_token_accuracy": 0.8724250420928001,
"num_tokens": 994490.0,
"step": 170
},
{
"entropy": 1.4822284132242203,
"epoch": 0.3902995720399429,
"grad_norm": 3.25,
"learning_rate": 4.7236557114175705e-06,
"loss": 0.4036,
"mean_token_accuracy": 0.8729385659098625,
"num_tokens": 1000341.0,
"step": 171
},
{
"entropy": 1.5275023579597473,
"epoch": 0.3925820256776034,
"grad_norm": 3.71875,
"learning_rate": 4.719336183907266e-06,
"loss": 0.5107,
"mean_token_accuracy": 0.846622422337532,
"num_tokens": 1005552.0,
"step": 172
},
{
"entropy": 1.4371494799852371,
"epoch": 0.3948644793152639,
"grad_norm": 3.859375,
"learning_rate": 4.7149851651234085e-06,
"loss": 0.4761,
"mean_token_accuracy": 0.856620728969574,
"num_tokens": 1011272.0,
"step": 173
},
{
"entropy": 1.4481075257062912,
"epoch": 0.3971469329529244,
"grad_norm": 3.265625,
"learning_rate": 4.710602716804784e-06,
"loss": 0.4907,
"mean_token_accuracy": 0.8551308736205101,
"num_tokens": 1018025.0,
"step": 174
},
{
"entropy": 1.4776830077171326,
"epoch": 0.39942938659058486,
"grad_norm": 3.484375,
"learning_rate": 4.706188901136148e-06,
"loss": 0.4157,
"mean_token_accuracy": 0.8659848943352699,
"num_tokens": 1023559.0,
"step": 175
},
{
"entropy": 1.3460393995046616,
"epoch": 0.40171184022824535,
"grad_norm": 3.15625,
"learning_rate": 4.701743780747345e-06,
"loss": 0.3891,
"mean_token_accuracy": 0.8979940786957741,
"num_tokens": 1029587.0,
"step": 176
},
{
"entropy": 1.5323508977890015,
"epoch": 0.40399429386590585,
"grad_norm": 3.671875,
"learning_rate": 4.697267418712415e-06,
"loss": 0.5064,
"mean_token_accuracy": 0.8600496724247932,
"num_tokens": 1035523.0,
"step": 177
},
{
"entropy": 1.3961755633354187,
"epoch": 0.40627674750356635,
"grad_norm": 3.4375,
"learning_rate": 4.6927598785487026e-06,
"loss": 0.4937,
"mean_token_accuracy": 0.8478540182113647,
"num_tokens": 1041403.0,
"step": 178
},
{
"entropy": 1.4182656705379486,
"epoch": 0.40855920114122685,
"grad_norm": 3.015625,
"learning_rate": 4.6882212242159555e-06,
"loss": 0.3456,
"mean_token_accuracy": 0.8982625529170036,
"num_tokens": 1047682.0,
"step": 179
},
{
"entropy": 1.548415094614029,
"epoch": 0.4108416547788873,
"grad_norm": 4.3125,
"learning_rate": 4.683651520115414e-06,
"loss": 0.5678,
"mean_token_accuracy": 0.8428888395428658,
"num_tokens": 1053172.0,
"step": 180
},
{
"entropy": 1.396517127752304,
"epoch": 0.4131241084165478,
"grad_norm": 3.46875,
"learning_rate": 4.679050831088902e-06,
"loss": 0.4803,
"mean_token_accuracy": 0.856790341436863,
"num_tokens": 1059373.0,
"step": 181
},
{
"entropy": 1.3589655607938766,
"epoch": 0.4154065620542083,
"grad_norm": 3.53125,
"learning_rate": 4.674419222417899e-06,
"loss": 0.3944,
"mean_token_accuracy": 0.8856743425130844,
"num_tokens": 1065347.0,
"step": 182
},
{
"entropy": 1.5359989404678345,
"epoch": 0.4176890156918688,
"grad_norm": 4.15625,
"learning_rate": 4.669756759822625e-06,
"loss": 0.4896,
"mean_token_accuracy": 0.8504308834671974,
"num_tokens": 1070311.0,
"step": 183
},
{
"entropy": 1.3297200053930283,
"epoch": 0.4199714693295292,
"grad_norm": 3.21875,
"learning_rate": 4.665063509461098e-06,
"loss": 0.3047,
"mean_token_accuracy": 0.9152820706367493,
"num_tokens": 1076590.0,
"step": 184
},
{
"entropy": 1.3356045931577682,
"epoch": 0.4222539229671897,
"grad_norm": 2.9375,
"learning_rate": 4.660339537928198e-06,
"loss": 0.3891,
"mean_token_accuracy": 0.8858283907175064,
"num_tokens": 1082550.0,
"step": 185
},
{
"entropy": 1.518212452530861,
"epoch": 0.4245363766048502,
"grad_norm": 3.078125,
"learning_rate": 4.655584912254727e-06,
"loss": 0.393,
"mean_token_accuracy": 0.8758783265948296,
"num_tokens": 1088391.0,
"step": 186
},
{
"entropy": 1.488260880112648,
"epoch": 0.4268188302425107,
"grad_norm": 3.65625,
"learning_rate": 4.650799699906452e-06,
"loss": 0.4005,
"mean_token_accuracy": 0.871321365237236,
"num_tokens": 1093823.0,
"step": 187
},
{
"entropy": 1.4447910338640213,
"epoch": 0.42910128388017116,
"grad_norm": 3.21875,
"learning_rate": 4.645983968783148e-06,
"loss": 0.3873,
"mean_token_accuracy": 0.8878121376037598,
"num_tokens": 1099347.0,
"step": 188
},
{
"entropy": 1.4393097907304764,
"epoch": 0.43138373751783166,
"grad_norm": 3.40625,
"learning_rate": 4.64113778721764e-06,
"loss": 0.3712,
"mean_token_accuracy": 0.8943579867482185,
"num_tokens": 1104941.0,
"step": 189
},
{
"entropy": 1.5411454141139984,
"epoch": 0.43366619115549215,
"grad_norm": 4.15625,
"learning_rate": 4.636261223974826e-06,
"loss": 0.498,
"mean_token_accuracy": 0.8571378961205482,
"num_tokens": 1110031.0,
"step": 190
},
{
"entropy": 1.3604239225387573,
"epoch": 0.43594864479315265,
"grad_norm": 3.53125,
"learning_rate": 4.631354348250706e-06,
"loss": 0.4366,
"mean_token_accuracy": 0.8668901473283768,
"num_tokens": 1116176.0,
"step": 191
},
{
"entropy": 1.4267419427633286,
"epoch": 0.43823109843081315,
"grad_norm": 3.125,
"learning_rate": 4.626417229671401e-06,
"loss": 0.4324,
"mean_token_accuracy": 0.8729524612426758,
"num_tokens": 1122065.0,
"step": 192
},
{
"entropy": 1.554912507534027,
"epoch": 0.4405135520684736,
"grad_norm": 4.34375,
"learning_rate": 4.621449938292159e-06,
"loss": 0.5843,
"mean_token_accuracy": 0.8273278325796127,
"num_tokens": 1127506.0,
"step": 193
},
{
"entropy": 1.3502502888441086,
"epoch": 0.4427960057061341,
"grad_norm": 2.828125,
"learning_rate": 4.616452544596367e-06,
"loss": 0.3874,
"mean_token_accuracy": 0.8785886839032173,
"num_tokens": 1133494.0,
"step": 194
},
{
"entropy": 1.4718603789806366,
"epoch": 0.4450784593437946,
"grad_norm": 3.90625,
"learning_rate": 4.611425119494552e-06,
"loss": 0.4499,
"mean_token_accuracy": 0.8621420189738274,
"num_tokens": 1139036.0,
"step": 195
},
{
"entropy": 1.592808559536934,
"epoch": 0.4473609129814551,
"grad_norm": 4.5625,
"learning_rate": 4.606367734323365e-06,
"loss": 0.5667,
"mean_token_accuracy": 0.832310289144516,
"num_tokens": 1144022.0,
"step": 196
},
{
"entropy": 1.410594865679741,
"epoch": 0.4496433666191155,
"grad_norm": 4.03125,
"learning_rate": 4.601280460844583e-06,
"loss": 0.5266,
"mean_token_accuracy": 0.855924166738987,
"num_tokens": 1150011.0,
"step": 197
},
{
"entropy": 1.4304940402507782,
"epoch": 0.451925820256776,
"grad_norm": 4.46875,
"learning_rate": 4.596163371244076e-06,
"loss": 0.5302,
"mean_token_accuracy": 0.8468711525201797,
"num_tokens": 1155938.0,
"step": 198
},
{
"entropy": 1.4850642681121826,
"epoch": 0.4542082738944365,
"grad_norm": 3.8125,
"learning_rate": 4.591016538130796e-06,
"loss": 0.5296,
"mean_token_accuracy": 0.8607726991176605,
"num_tokens": 1161187.0,
"step": 199
},
{
"entropy": 1.495200276374817,
"epoch": 0.456490727532097,
"grad_norm": 3.71875,
"learning_rate": 4.585840034535736e-06,
"loss": 0.4806,
"mean_token_accuracy": 0.865336537361145,
"num_tokens": 1167354.0,
"step": 200
},
{
"epoch": 0.456490727532097,
"eval_entropy": 1.4645510156949362,
"eval_loss": 0.48574092984199524,
"eval_mean_token_accuracy": 0.8648963557349311,
"eval_num_tokens": 1167354.0,
"eval_runtime": 4.6146,
"eval_samples_per_second": 19.503,
"eval_steps_per_second": 19.503,
"step": 200
},
{
"entropy": 1.5986905246973038,
"epoch": 0.4587731811697575,
"grad_norm": 3.984375,
"learning_rate": 4.580633933910901e-06,
"loss": 0.4827,
"mean_token_accuracy": 0.8589570224285126,
"num_tokens": 1173168.0,
"step": 201
},
{
"entropy": 1.4304189831018448,
"epoch": 0.46105563480741796,
"grad_norm": 3.328125,
"learning_rate": 4.575398310128263e-06,
"loss": 0.432,
"mean_token_accuracy": 0.870637446641922,
"num_tokens": 1178884.0,
"step": 202
},
{
"entropy": 1.5412327647209167,
"epoch": 0.46333808844507846,
"grad_norm": 4.03125,
"learning_rate": 4.570133237478711e-06,
"loss": 0.5089,
"mean_token_accuracy": 0.8491686582565308,
"num_tokens": 1184480.0,
"step": 203
},
{
"entropy": 1.4805094599723816,
"epoch": 0.46562054208273895,
"grad_norm": 3.453125,
"learning_rate": 4.564838790671e-06,
"loss": 0.5336,
"mean_token_accuracy": 0.8480750620365143,
"num_tokens": 1190484.0,
"step": 204
},
{
"entropy": 1.4799759984016418,
"epoch": 0.46790299572039945,
"grad_norm": 3.359375,
"learning_rate": 4.55951504483069e-06,
"loss": 0.4372,
"mean_token_accuracy": 0.8827960044145584,
"num_tokens": 1195901.0,
"step": 205
},
{
"entropy": 1.5237813293933868,
"epoch": 0.4701854493580599,
"grad_norm": 4.125,
"learning_rate": 4.55416207549908e-06,
"loss": 0.613,
"mean_token_accuracy": 0.8307090178132057,
"num_tokens": 1201383.0,
"step": 206
},
{
"entropy": 1.438712790608406,
"epoch": 0.4724679029957204,
"grad_norm": 3.21875,
"learning_rate": 4.548779958632134e-06,
"loss": 0.5351,
"mean_token_accuracy": 0.8520702794194221,
"num_tokens": 1207874.0,
"step": 207
},
{
"entropy": 1.4036246687173843,
"epoch": 0.4747503566333809,
"grad_norm": 3.390625,
"learning_rate": 4.543368770599406e-06,
"loss": 0.346,
"mean_token_accuracy": 0.8787712529301643,
"num_tokens": 1213989.0,
"step": 208
},
{
"entropy": 1.486038789153099,
"epoch": 0.4770328102710414,
"grad_norm": 3.71875,
"learning_rate": 4.537928588182955e-06,
"loss": 0.5211,
"mean_token_accuracy": 0.8482290953397751,
"num_tokens": 1219525.0,
"step": 209
},
{
"entropy": 1.444077506661415,
"epoch": 0.4793152639087018,
"grad_norm": 2.953125,
"learning_rate": 4.532459488576258e-06,
"loss": 0.3976,
"mean_token_accuracy": 0.8832324147224426,
"num_tokens": 1226231.0,
"step": 210
},
{
"entropy": 1.5054399818181992,
"epoch": 0.4815977175463623,
"grad_norm": 3.671875,
"learning_rate": 4.526961549383109e-06,
"loss": 0.4581,
"mean_token_accuracy": 0.8546851649880409,
"num_tokens": 1232271.0,
"step": 211
},
{
"entropy": 1.4887232929468155,
"epoch": 0.4838801711840228,
"grad_norm": 3.1875,
"learning_rate": 4.521434848616523e-06,
"loss": 0.4776,
"mean_token_accuracy": 0.8665826469659805,
"num_tokens": 1239076.0,
"step": 212
},
{
"entropy": 1.4471513032913208,
"epoch": 0.4861626248216833,
"grad_norm": 3.140625,
"learning_rate": 4.515879464697629e-06,
"loss": 0.3437,
"mean_token_accuracy": 0.9033405035734177,
"num_tokens": 1245117.0,
"step": 213
},
{
"entropy": 1.4255793392658234,
"epoch": 0.4884450784593438,
"grad_norm": 3.21875,
"learning_rate": 4.5102954764545525e-06,
"loss": 0.3922,
"mean_token_accuracy": 0.879116877913475,
"num_tokens": 1251024.0,
"step": 214
},
{
"entropy": 1.4146728217601776,
"epoch": 0.49072753209700426,
"grad_norm": 3.21875,
"learning_rate": 4.5046829631213014e-06,
"loss": 0.4581,
"mean_token_accuracy": 0.8701305538415909,
"num_tokens": 1257738.0,
"step": 215
},
{
"entropy": 1.4356386065483093,
"epoch": 0.49300998573466476,
"grad_norm": 2.796875,
"learning_rate": 4.499042004336642e-06,
"loss": 0.4283,
"mean_token_accuracy": 0.8771600425243378,
"num_tokens": 1265254.0,
"step": 216
},
{
"entropy": 1.3496776968240738,
"epoch": 0.49529243937232525,
"grad_norm": 2.859375,
"learning_rate": 4.4933726801429665e-06,
"loss": 0.3705,
"mean_token_accuracy": 0.8920829594135284,
"num_tokens": 1271970.0,
"step": 217
},
{
"entropy": 1.5127773433923721,
"epoch": 0.49757489300998575,
"grad_norm": 3.78125,
"learning_rate": 4.487675070985156e-06,
"loss": 0.4624,
"mean_token_accuracy": 0.8566678315401077,
"num_tokens": 1277606.0,
"step": 218
},
{
"entropy": 1.4766086488962173,
"epoch": 0.4998573466476462,
"grad_norm": 3.796875,
"learning_rate": 4.481949257709442e-06,
"loss": 0.4412,
"mean_token_accuracy": 0.8686520978808403,
"num_tokens": 1283617.0,
"step": 219
},
{
"entropy": 1.5000656843185425,
"epoch": 0.5021398002853067,
"grad_norm": 4.40625,
"learning_rate": 4.476195321562262e-06,
"loss": 0.5898,
"mean_token_accuracy": 0.8323855772614479,
"num_tokens": 1289328.0,
"step": 220
},
{
"entropy": 1.4562593698501587,
"epoch": 0.5044222539229671,
"grad_norm": 3.625,
"learning_rate": 4.470413344189098e-06,
"loss": 0.4657,
"mean_token_accuracy": 0.8688141480088234,
"num_tokens": 1294897.0,
"step": 221
},
{
"entropy": 1.412929117679596,
"epoch": 0.5067047075606277,
"grad_norm": 3.515625,
"learning_rate": 4.464603407633326e-06,
"loss": 0.4717,
"mean_token_accuracy": 0.8586973398923874,
"num_tokens": 1300887.0,
"step": 222
},
{
"entropy": 1.5253776609897614,
"epoch": 0.5089871611982881,
"grad_norm": 3.390625,
"learning_rate": 4.458765594335048e-06,
"loss": 0.473,
"mean_token_accuracy": 0.8543320819735527,
"num_tokens": 1306712.0,
"step": 223
},
{
"entropy": 1.5946801453828812,
"epoch": 0.5112696148359487,
"grad_norm": 3.90625,
"learning_rate": 4.452899987129922e-06,
"loss": 0.5303,
"mean_token_accuracy": 0.8440029099583626,
"num_tokens": 1311955.0,
"step": 224
},
{
"entropy": 1.3364089578390121,
"epoch": 0.5135520684736091,
"grad_norm": 3.140625,
"learning_rate": 4.44700666924799e-06,
"loss": 0.3431,
"mean_token_accuracy": 0.8987620249390602,
"num_tokens": 1318460.0,
"step": 225
},
{
"entropy": 1.4394992887973785,
"epoch": 0.5158345221112696,
"grad_norm": 3.0,
"learning_rate": 4.441085724312494e-06,
"loss": 0.4805,
"mean_token_accuracy": 0.861751489341259,
"num_tokens": 1325269.0,
"step": 226
},
{
"entropy": 1.4739690721035004,
"epoch": 0.5181169757489301,
"grad_norm": 3.5,
"learning_rate": 4.435137236338688e-06,
"loss": 0.4712,
"mean_token_accuracy": 0.8692339286208153,
"num_tokens": 1331087.0,
"step": 227
},
{
"entropy": 1.408553659915924,
"epoch": 0.5203994293865906,
"grad_norm": 3.703125,
"learning_rate": 4.42916128973265e-06,
"loss": 0.545,
"mean_token_accuracy": 0.8480049669742584,
"num_tokens": 1336928.0,
"step": 228
},
{
"entropy": 1.4906915128231049,
"epoch": 0.5226818830242511,
"grad_norm": 4.3125,
"learning_rate": 4.423157969290081e-06,
"loss": 0.4943,
"mean_token_accuracy": 0.8629228696227074,
"num_tokens": 1341951.0,
"step": 229
},
{
"entropy": 1.5799495428800583,
"epoch": 0.5249643366619116,
"grad_norm": 3.875,
"learning_rate": 4.417127360195107e-06,
"loss": 0.454,
"mean_token_accuracy": 0.8446270450949669,
"num_tokens": 1346983.0,
"step": 230
},
{
"entropy": 1.3668962121009827,
"epoch": 0.527246790299572,
"grad_norm": 3.28125,
"learning_rate": 4.41106954801906e-06,
"loss": 0.3977,
"mean_token_accuracy": 0.8871706500649452,
"num_tokens": 1354122.0,
"step": 231
},
{
"entropy": 1.5603487640619278,
"epoch": 0.5295292439372326,
"grad_norm": 3.359375,
"learning_rate": 4.404984618719275e-06,
"loss": 0.4717,
"mean_token_accuracy": 0.8657551482319832,
"num_tokens": 1359608.0,
"step": 232
},
{
"entropy": 1.4570914506912231,
"epoch": 0.531811697574893,
"grad_norm": 3.1875,
"learning_rate": 4.398872658637863e-06,
"loss": 0.4311,
"mean_token_accuracy": 0.8685552924871445,
"num_tokens": 1365590.0,
"step": 233
},
{
"entropy": 1.329675242304802,
"epoch": 0.5340941512125535,
"grad_norm": 3.09375,
"learning_rate": 4.39273375450049e-06,
"loss": 0.4566,
"mean_token_accuracy": 0.8627236634492874,
"num_tokens": 1372145.0,
"step": 234
},
{
"entropy": 1.4357402175664902,
"epoch": 0.536376604850214,
"grad_norm": 3.5,
"learning_rate": 4.386567993415144e-06,
"loss": 0.4507,
"mean_token_accuracy": 0.8667884543538094,
"num_tokens": 1377900.0,
"step": 235
},
{
"entropy": 1.5077559649944305,
"epoch": 0.5386590584878744,
"grad_norm": 3.984375,
"learning_rate": 4.3803754628708995e-06,
"loss": 0.5176,
"mean_token_accuracy": 0.8583211898803711,
"num_tokens": 1383999.0,
"step": 236
},
{
"entropy": 1.3777508586645126,
"epoch": 0.540941512125535,
"grad_norm": 3.265625,
"learning_rate": 4.3741562507366754e-06,
"loss": 0.3431,
"mean_token_accuracy": 0.8923545554280281,
"num_tokens": 1390419.0,
"step": 237
},
{
"entropy": 1.4933728128671646,
"epoch": 0.5432239657631954,
"grad_norm": 3.203125,
"learning_rate": 4.367910445259991e-06,
"loss": 0.4044,
"mean_token_accuracy": 0.8686385452747345,
"num_tokens": 1396684.0,
"step": 238
},
{
"entropy": 1.4653480350971222,
"epoch": 0.5455064194008559,
"grad_norm": 3.25,
"learning_rate": 4.361638135065711e-06,
"loss": 0.4561,
"mean_token_accuracy": 0.8716481998562813,
"num_tokens": 1402830.0,
"step": 239
},
{
"entropy": 1.5274227857589722,
"epoch": 0.5477888730385164,
"grad_norm": 3.6875,
"learning_rate": 4.355339409154788e-06,
"loss": 0.5069,
"mean_token_accuracy": 0.8373076170682907,
"num_tokens": 1408506.0,
"step": 240
},
{
"entropy": 1.4511406421661377,
"epoch": 0.5500713266761769,
"grad_norm": 3.3125,
"learning_rate": 4.3490143569030025e-06,
"loss": 0.4684,
"mean_token_accuracy": 0.8665965721011162,
"num_tokens": 1414792.0,
"step": 241
},
{
"entropy": 1.3838857859373093,
"epoch": 0.5523537803138374,
"grad_norm": 3.4375,
"learning_rate": 4.34266306805969e-06,
"loss": 0.4547,
"mean_token_accuracy": 0.8690644651651382,
"num_tokens": 1420524.0,
"step": 242
},
{
"entropy": 1.4130767732858658,
"epoch": 0.5546362339514979,
"grad_norm": 3.46875,
"learning_rate": 4.336285632746472e-06,
"loss": 0.471,
"mean_token_accuracy": 0.8564508408308029,
"num_tokens": 1426426.0,
"step": 243
},
{
"entropy": 1.618276908993721,
"epoch": 0.5569186875891583,
"grad_norm": 4.03125,
"learning_rate": 4.329882141455974e-06,
"loss": 0.5143,
"mean_token_accuracy": 0.8403759598731995,
"num_tokens": 1431586.0,
"step": 244
},
{
"entropy": 1.4412871301174164,
"epoch": 0.5592011412268189,
"grad_norm": 3.90625,
"learning_rate": 4.323452685050545e-06,
"loss": 0.4539,
"mean_token_accuracy": 0.863670825958252,
"num_tokens": 1437354.0,
"step": 245
},
{
"entropy": 1.4914350509643555,
"epoch": 0.5614835948644793,
"grad_norm": 3.03125,
"learning_rate": 4.316997354760965e-06,
"loss": 0.3826,
"mean_token_accuracy": 0.8802237138152122,
"num_tokens": 1443221.0,
"step": 246
},
{
"entropy": 1.5026773810386658,
"epoch": 0.5637660485021398,
"grad_norm": 3.359375,
"learning_rate": 4.3105162421851494e-06,
"loss": 0.4275,
"mean_token_accuracy": 0.8739782869815826,
"num_tokens": 1448716.0,
"step": 247
},
{
"entropy": 1.467271402478218,
"epoch": 0.5660485021398003,
"grad_norm": 3.765625,
"learning_rate": 4.304009439286855e-06,
"loss": 0.4786,
"mean_token_accuracy": 0.8454955220222473,
"num_tokens": 1453607.0,
"step": 248
},
{
"entropy": 1.3084248155355453,
"epoch": 0.5683309557774607,
"grad_norm": 3.171875,
"learning_rate": 4.297477038394368e-06,
"loss": 0.4264,
"mean_token_accuracy": 0.8782637789845467,
"num_tokens": 1460122.0,
"step": 249
},
{
"entropy": 1.4157914519309998,
"epoch": 0.5706134094151213,
"grad_norm": 3.046875,
"learning_rate": 4.2909191321992e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.8630497455596924,
"num_tokens": 1466789.0,
"step": 250
},
{
"entropy": 1.3701231330633163,
"epoch": 0.5728958630527817,
"grad_norm": 3.53125,
"learning_rate": 4.28433581375477e-06,
"loss": 0.4331,
"mean_token_accuracy": 0.874555304646492,
"num_tokens": 1472752.0,
"step": 251
},
{
"entropy": 1.5737513154745102,
"epoch": 0.5751783166904422,
"grad_norm": 3.625,
"learning_rate": 4.2777271764750805e-06,
"loss": 0.4553,
"mean_token_accuracy": 0.8664311170578003,
"num_tokens": 1478473.0,
"step": 252
},
{
"entropy": 1.525623768568039,
"epoch": 0.5774607703281027,
"grad_norm": 3.390625,
"learning_rate": 4.271093314133401e-06,
"loss": 0.466,
"mean_token_accuracy": 0.8556927219033241,
"num_tokens": 1484284.0,
"step": 253
},
{
"entropy": 1.4639706760644913,
"epoch": 0.5797432239657632,
"grad_norm": 3.75,
"learning_rate": 4.264434320860929e-06,
"loss": 0.5532,
"mean_token_accuracy": 0.844054289162159,
"num_tokens": 1490166.0,
"step": 254
},
{
"entropy": 1.5366946905851364,
"epoch": 0.5820256776034237,
"grad_norm": 3.65625,
"learning_rate": 4.257750291145457e-06,
"loss": 0.5268,
"mean_token_accuracy": 0.8521439135074615,
"num_tokens": 1495689.0,
"step": 255
},
{
"entropy": 1.5063273757696152,
"epoch": 0.5843081312410842,
"grad_norm": 3.796875,
"learning_rate": 4.251041319830034e-06,
"loss": 0.5244,
"mean_token_accuracy": 0.8497593402862549,
"num_tokens": 1501104.0,
"step": 256
},
{
"entropy": 1.5439026057720184,
"epoch": 0.5865905848787446,
"grad_norm": 3.3125,
"learning_rate": 4.2443075021116166e-06,
"loss": 0.3605,
"mean_token_accuracy": 0.8726519420742989,
"num_tokens": 1506924.0,
"step": 257
},
{
"entropy": 1.4876836389303207,
"epoch": 0.5888730385164052,
"grad_norm": 3.640625,
"learning_rate": 4.237548933539718e-06,
"loss": 0.4703,
"mean_token_accuracy": 0.866664931178093,
"num_tokens": 1512828.0,
"step": 258
},
{
"entropy": 1.480648323893547,
"epoch": 0.5911554921540656,
"grad_norm": 3.3125,
"learning_rate": 4.230765710015058e-06,
"loss": 0.466,
"mean_token_accuracy": 0.8522143065929413,
"num_tokens": 1518522.0,
"step": 259
},
{
"entropy": 1.4419532120227814,
"epoch": 0.5934379457917262,
"grad_norm": 3.703125,
"learning_rate": 4.223957927788195e-06,
"loss": 0.4973,
"mean_token_accuracy": 0.8543191030621529,
"num_tokens": 1523970.0,
"step": 260
},
{
"entropy": 1.4034761041402817,
"epoch": 0.5957203994293866,
"grad_norm": 2.984375,
"learning_rate": 4.217125683458162e-06,
"loss": 0.3724,
"mean_token_accuracy": 0.8887425437569618,
"num_tokens": 1530150.0,
"step": 261
},
{
"entropy": 1.5668024867773056,
"epoch": 0.598002853067047,
"grad_norm": 4.5,
"learning_rate": 4.210269073971098e-06,
"loss": 0.4921,
"mean_token_accuracy": 0.8630413040518761,
"num_tokens": 1535368.0,
"step": 262
},
{
"entropy": 1.4702572673559189,
"epoch": 0.6002853067047076,
"grad_norm": 3.40625,
"learning_rate": 4.203388196618874e-06,
"loss": 0.3834,
"mean_token_accuracy": 0.8823850229382515,
"num_tokens": 1541388.0,
"step": 263
},
{
"entropy": 1.353348970413208,
"epoch": 0.602567760342368,
"grad_norm": 3.03125,
"learning_rate": 4.196483149037707e-06,
"loss": 0.3882,
"mean_token_accuracy": 0.8797778934240341,
"num_tokens": 1547245.0,
"step": 264
},
{
"entropy": 1.3397300243377686,
"epoch": 0.6048502139800286,
"grad_norm": 3.453125,
"learning_rate": 4.1895540292067765e-06,
"loss": 0.4969,
"mean_token_accuracy": 0.8677136451005936,
"num_tokens": 1553007.0,
"step": 265
},
{
"entropy": 1.533875733613968,
"epoch": 0.607132667617689,
"grad_norm": 4.15625,
"learning_rate": 4.18260093544684e-06,
"loss": 0.5423,
"mean_token_accuracy": 0.8619329035282135,
"num_tokens": 1559044.0,
"step": 266
},
{
"entropy": 1.4415863156318665,
"epoch": 0.6094151212553495,
"grad_norm": 3.296875,
"learning_rate": 4.1756239664188275e-06,
"loss": 0.4586,
"mean_token_accuracy": 0.8679523020982742,
"num_tokens": 1565121.0,
"step": 267
},
{
"entropy": 1.5389132052659988,
"epoch": 0.61169757489301,
"grad_norm": 3.515625,
"learning_rate": 4.168623221122451e-06,
"loss": 0.3954,
"mean_token_accuracy": 0.8800017014145851,
"num_tokens": 1570839.0,
"step": 268
},
{
"entropy": 1.4849002212285995,
"epoch": 0.6139800285306705,
"grad_norm": 4.46875,
"learning_rate": 4.161598798894795e-06,
"loss": 0.5272,
"mean_token_accuracy": 0.842116691172123,
"num_tokens": 1576765.0,
"step": 269
},
{
"entropy": 1.526948407292366,
"epoch": 0.6162624821683309,
"grad_norm": 3.796875,
"learning_rate": 4.154550799408906e-06,
"loss": 0.4815,
"mean_token_accuracy": 0.8517501726746559,
"num_tokens": 1582404.0,
"step": 270
},
{
"entropy": 1.5471256375312805,
"epoch": 0.6185449358059915,
"grad_norm": 4.0625,
"learning_rate": 4.147479322672383e-06,
"loss": 0.5704,
"mean_token_accuracy": 0.8349821045994759,
"num_tokens": 1588027.0,
"step": 271
},
{
"entropy": 1.3742996156215668,
"epoch": 0.6208273894436519,
"grad_norm": 3.03125,
"learning_rate": 4.1403844690259544e-06,
"loss": 0.4357,
"mean_token_accuracy": 0.8906814530491829,
"num_tokens": 1594482.0,
"step": 272
},
{
"entropy": 1.7183980494737625,
"epoch": 0.6231098430813125,
"grad_norm": 4.625,
"learning_rate": 4.1332663391420515e-06,
"loss": 0.6023,
"mean_token_accuracy": 0.8240282908082008,
"num_tokens": 1599978.0,
"step": 273
},
{
"entropy": 1.4364304840564728,
"epoch": 0.6253922967189729,
"grad_norm": 3.140625,
"learning_rate": 4.126125034023392e-06,
"loss": 0.4642,
"mean_token_accuracy": 0.8591607213020325,
"num_tokens": 1606427.0,
"step": 274
},
{
"entropy": 1.4346765726804733,
"epoch": 0.6276747503566333,
"grad_norm": 3.28125,
"learning_rate": 4.11896065500153e-06,
"loss": 0.4256,
"mean_token_accuracy": 0.8701624721288681,
"num_tokens": 1612618.0,
"step": 275
},
{
"entropy": 1.625702291727066,
"epoch": 0.6299572039942939,
"grad_norm": 5.125,
"learning_rate": 4.111773303735432e-06,
"loss": 0.4558,
"mean_token_accuracy": 0.8545658215880394,
"num_tokens": 1617388.0,
"step": 276
},
{
"entropy": 1.4506097733974457,
"epoch": 0.6322396576319543,
"grad_norm": 3.046875,
"learning_rate": 4.104563082210028e-06,
"loss": 0.4293,
"mean_token_accuracy": 0.8728143572807312,
"num_tokens": 1623851.0,
"step": 277
},
{
"entropy": 1.5303080081939697,
"epoch": 0.6345221112696149,
"grad_norm": 3.515625,
"learning_rate": 4.097330092734765e-06,
"loss": 0.5024,
"mean_token_accuracy": 0.8505230322480202,
"num_tokens": 1629428.0,
"step": 278
},
{
"entropy": 1.4354898631572723,
"epoch": 0.6368045649072753,
"grad_norm": 3.03125,
"learning_rate": 4.090074437942155e-06,
"loss": 0.435,
"mean_token_accuracy": 0.8785936459898949,
"num_tokens": 1635769.0,
"step": 279
},
{
"entropy": 1.547384113073349,
"epoch": 0.6390870185449358,
"grad_norm": 4.0,
"learning_rate": 4.082796220786324e-06,
"loss": 0.5469,
"mean_token_accuracy": 0.8383355513215065,
"num_tokens": 1641791.0,
"step": 280
},
{
"entropy": 1.480806604027748,
"epoch": 0.6413694721825963,
"grad_norm": 4.4375,
"learning_rate": 4.0754955445415405e-06,
"loss": 0.4233,
"mean_token_accuracy": 0.8961210995912552,
"num_tokens": 1646709.0,
"step": 281
},
{
"entropy": 1.4669694900512695,
"epoch": 0.6436519258202568,
"grad_norm": 4.0,
"learning_rate": 4.06817251280076e-06,
"loss": 0.4288,
"mean_token_accuracy": 0.8806118816137314,
"num_tokens": 1651676.0,
"step": 282
},
{
"entropy": 1.4136276096105576,
"epoch": 0.6459343794579172,
"grad_norm": 3.15625,
"learning_rate": 4.06082722947415e-06,
"loss": 0.4005,
"mean_token_accuracy": 0.8672489523887634,
"num_tokens": 1657293.0,
"step": 283
},
{
"entropy": 1.4642555862665176,
"epoch": 0.6482168330955778,
"grad_norm": 3.421875,
"learning_rate": 4.053459798787619e-06,
"loss": 0.4534,
"mean_token_accuracy": 0.8670831546187401,
"num_tokens": 1662778.0,
"step": 284
},
{
"entropy": 1.4143490493297577,
"epoch": 0.6504992867332382,
"grad_norm": 3.015625,
"learning_rate": 4.046070325281333e-06,
"loss": 0.4511,
"mean_token_accuracy": 0.8704198077321053,
"num_tokens": 1669050.0,
"step": 285
},
{
"entropy": 1.402878537774086,
"epoch": 0.6527817403708988,
"grad_norm": 2.78125,
"learning_rate": 4.038658913808235e-06,
"loss": 0.3552,
"mean_token_accuracy": 0.8852335959672928,
"num_tokens": 1675168.0,
"step": 286
},
{
"entropy": 1.4332346022129059,
"epoch": 0.6550641940085592,
"grad_norm": 3.359375,
"learning_rate": 4.031225669532558e-06,
"loss": 0.4411,
"mean_token_accuracy": 0.8605756536126137,
"num_tokens": 1680716.0,
"step": 287
},
{
"entropy": 1.4855122715234756,
"epoch": 0.6573466476462196,
"grad_norm": 3.890625,
"learning_rate": 4.0237706979283306e-06,
"loss": 0.5067,
"mean_token_accuracy": 0.8480587676167488,
"num_tokens": 1686358.0,
"step": 288
},
{
"entropy": 1.415476381778717,
"epoch": 0.6596291012838802,
"grad_norm": 3.109375,
"learning_rate": 4.016294104777883e-06,
"loss": 0.3724,
"mean_token_accuracy": 0.8872483521699905,
"num_tokens": 1692477.0,
"step": 289
},
{
"entropy": 1.4918617755174637,
"epoch": 0.6619115549215406,
"grad_norm": 3.546875,
"learning_rate": 4.008795996170341e-06,
"loss": 0.481,
"mean_token_accuracy": 0.8568604290485382,
"num_tokens": 1698377.0,
"step": 290
},
{
"entropy": 1.3961764425039291,
"epoch": 0.6641940085592012,
"grad_norm": 3.015625,
"learning_rate": 4.001276478500127e-06,
"loss": 0.3972,
"mean_token_accuracy": 0.885112538933754,
"num_tokens": 1704209.0,
"step": 291
},
{
"entropy": 1.4769706726074219,
"epoch": 0.6664764621968616,
"grad_norm": 3.640625,
"learning_rate": 3.993735658465446e-06,
"loss": 0.5053,
"mean_token_accuracy": 0.8577989414334297,
"num_tokens": 1710422.0,
"step": 292
},
{
"entropy": 1.3838878571987152,
"epoch": 0.6687589158345221,
"grad_norm": 2.96875,
"learning_rate": 3.986173643066774e-06,
"loss": 0.3759,
"mean_token_accuracy": 0.8760515302419662,
"num_tokens": 1716105.0,
"step": 293
},
{
"entropy": 1.3878977000713348,
"epoch": 0.6710413694721826,
"grad_norm": 2.796875,
"learning_rate": 3.978590539605338e-06,
"loss": 0.329,
"mean_token_accuracy": 0.8979349583387375,
"num_tokens": 1723015.0,
"step": 294
},
{
"entropy": 1.4417504221200943,
"epoch": 0.6733238231098431,
"grad_norm": 3.453125,
"learning_rate": 3.970986455681593e-06,
"loss": 0.5339,
"mean_token_accuracy": 0.854948602616787,
"num_tokens": 1729102.0,
"step": 295
},
{
"entropy": 1.4643060863018036,
"epoch": 0.6756062767475036,
"grad_norm": 3.171875,
"learning_rate": 3.963361499193699e-06,
"loss": 0.4545,
"mean_token_accuracy": 0.8652586191892624,
"num_tokens": 1734903.0,
"step": 296
},
{
"entropy": 1.4911223948001862,
"epoch": 0.6778887303851641,
"grad_norm": 3.25,
"learning_rate": 3.955715778335984e-06,
"loss": 0.4584,
"mean_token_accuracy": 0.8684913441538811,
"num_tokens": 1740820.0,
"step": 297
},
{
"entropy": 1.414558470249176,
"epoch": 0.6801711840228245,
"grad_norm": 3.046875,
"learning_rate": 3.948049401597414e-06,
"loss": 0.4304,
"mean_token_accuracy": 0.8772279694676399,
"num_tokens": 1747285.0,
"step": 298
},
{
"entropy": 1.4684519618749619,
"epoch": 0.6824536376604851,
"grad_norm": 3.5,
"learning_rate": 3.9403624777600526e-06,
"loss": 0.3402,
"mean_token_accuracy": 0.8974613174796104,
"num_tokens": 1752238.0,
"step": 299
},
{
"entropy": 1.513798087835312,
"epoch": 0.6847360912981455,
"grad_norm": 4.4375,
"learning_rate": 3.932655115897513e-06,
"loss": 0.518,
"mean_token_accuracy": 0.8387879729270935,
"num_tokens": 1757263.0,
"step": 300
},
{
"epoch": 0.6847360912981455,
"eval_entropy": 1.4728518161508772,
"eval_loss": 0.4787273108959198,
"eval_mean_token_accuracy": 0.8652989500098758,
"eval_num_tokens": 1757263.0,
"eval_runtime": 4.4526,
"eval_samples_per_second": 20.213,
"eval_steps_per_second": 20.213,
"step": 300
},
{
"entropy": 1.5329459309577942,
"epoch": 0.6870185449358059,
"grad_norm": 3.0625,
"learning_rate": 3.924927425373417e-06,
"loss": 0.3762,
"mean_token_accuracy": 0.8721340969204903,
"num_tokens": 1762777.0,
"step": 301
},
{
"entropy": 1.3736970275640488,
"epoch": 0.6893009985734665,
"grad_norm": 2.84375,
"learning_rate": 3.91717951583984e-06,
"loss": 0.403,
"mean_token_accuracy": 0.8769481182098389,
"num_tokens": 1769311.0,
"step": 302
},
{
"entropy": 1.4778434038162231,
"epoch": 0.6915834522111269,
"grad_norm": 3.265625,
"learning_rate": 3.909411497235752e-06,
"loss": 0.4176,
"mean_token_accuracy": 0.8799067437648773,
"num_tokens": 1775618.0,
"step": 303
},
{
"entropy": 1.493824690580368,
"epoch": 0.6938659058487875,
"grad_norm": 3.046875,
"learning_rate": 3.901623479785465e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.8613429367542267,
"num_tokens": 1782559.0,
"step": 304
},
{
"entropy": 1.412913128733635,
"epoch": 0.6961483594864479,
"grad_norm": 4.28125,
"learning_rate": 3.89381557399706e-06,
"loss": 0.4606,
"mean_token_accuracy": 0.8659727945923805,
"num_tokens": 1788268.0,
"step": 305
},
{
"entropy": 1.3977010250091553,
"epoch": 0.6984308131241084,
"grad_norm": 3.046875,
"learning_rate": 3.885987890660828e-06,
"loss": 0.3609,
"mean_token_accuracy": 0.8855833634734154,
"num_tokens": 1794289.0,
"step": 306
},
{
"entropy": 1.4351555556058884,
"epoch": 0.7007132667617689,
"grad_norm": 3.203125,
"learning_rate": 3.87814054084769e-06,
"loss": 0.3922,
"mean_token_accuracy": 0.882360152900219,
"num_tokens": 1800100.0,
"step": 307
},
{
"entropy": 1.4589954763650894,
"epoch": 0.7029957203994294,
"grad_norm": 3.84375,
"learning_rate": 3.8702736359076265e-06,
"loss": 0.4728,
"mean_token_accuracy": 0.8583435043692589,
"num_tokens": 1806175.0,
"step": 308
},
{
"entropy": 1.497866302728653,
"epoch": 0.7052781740370899,
"grad_norm": 3.984375,
"learning_rate": 3.862387287468095e-06,
"loss": 0.5149,
"mean_token_accuracy": 0.8527609705924988,
"num_tokens": 1811406.0,
"step": 309
},
{
"entropy": 1.5516266524791718,
"epoch": 0.7075606276747504,
"grad_norm": 3.5,
"learning_rate": 3.854481607432445e-06,
"loss": 0.4476,
"mean_token_accuracy": 0.8626842275261879,
"num_tokens": 1816804.0,
"step": 310
},
{
"entropy": 1.3300371170043945,
"epoch": 0.7098430813124108,
"grad_norm": 3.0625,
"learning_rate": 3.846556707978337e-06,
"loss": 0.4001,
"mean_token_accuracy": 0.8860765770077705,
"num_tokens": 1823102.0,
"step": 311
},
{
"entropy": 1.5138549208641052,
"epoch": 0.7121255349500714,
"grad_norm": 3.71875,
"learning_rate": 3.838612701556138e-06,
"loss": 0.4696,
"mean_token_accuracy": 0.8707823753356934,
"num_tokens": 1828740.0,
"step": 312
},
{
"entropy": 1.4780635386705399,
"epoch": 0.7144079885877318,
"grad_norm": 3.140625,
"learning_rate": 3.830649700887339e-06,
"loss": 0.4598,
"mean_token_accuracy": 0.8627598807215691,
"num_tokens": 1835314.0,
"step": 313
},
{
"entropy": 1.483512207865715,
"epoch": 0.7166904422253922,
"grad_norm": 3.859375,
"learning_rate": 3.822667818962948e-06,
"loss": 0.3944,
"mean_token_accuracy": 0.8666610270738602,
"num_tokens": 1840589.0,
"step": 314
},
{
"entropy": 1.351017713546753,
"epoch": 0.7189728958630528,
"grad_norm": 3.046875,
"learning_rate": 3.814667169041887e-06,
"loss": 0.4589,
"mean_token_accuracy": 0.8681119009852409,
"num_tokens": 1846865.0,
"step": 315
},
{
"entropy": 1.454156056046486,
"epoch": 0.7212553495007132,
"grad_norm": 2.859375,
"learning_rate": 3.8066478646493898e-06,
"loss": 0.3616,
"mean_token_accuracy": 0.887380801141262,
"num_tokens": 1853343.0,
"step": 316
},
{
"entropy": 1.3507359623908997,
"epoch": 0.7235378031383738,
"grad_norm": 3.109375,
"learning_rate": 3.798610019575384e-06,
"loss": 0.3908,
"mean_token_accuracy": 0.8893059492111206,
"num_tokens": 1859535.0,
"step": 317
},
{
"entropy": 1.5166684240102768,
"epoch": 0.7258202567760342,
"grad_norm": 4.3125,
"learning_rate": 3.790553747872885e-06,
"loss": 0.5235,
"mean_token_accuracy": 0.8411901965737343,
"num_tokens": 1864957.0,
"step": 318
},
{
"entropy": 1.4589732587337494,
"epoch": 0.7281027104136947,
"grad_norm": 3.34375,
"learning_rate": 3.7824791638563674e-06,
"loss": 0.4074,
"mean_token_accuracy": 0.8821713030338287,
"num_tokens": 1870586.0,
"step": 319
},
{
"entropy": 1.429191216826439,
"epoch": 0.7303851640513552,
"grad_norm": 3.40625,
"learning_rate": 3.7743863821001538e-06,
"loss": 0.4902,
"mean_token_accuracy": 0.8597285747528076,
"num_tokens": 1876572.0,
"step": 320
},
{
"entropy": 1.52955062687397,
"epoch": 0.7326676176890157,
"grad_norm": 4.09375,
"learning_rate": 3.766275517436779e-06,
"loss": 0.5007,
"mean_token_accuracy": 0.8509823232889175,
"num_tokens": 1881581.0,
"step": 321
},
{
"entropy": 1.6073177456855774,
"epoch": 0.7349500713266762,
"grad_norm": 4.9375,
"learning_rate": 3.7581466849553685e-06,
"loss": 0.5742,
"mean_token_accuracy": 0.8330699577927589,
"num_tokens": 1886980.0,
"step": 322
},
{
"entropy": 1.490510642528534,
"epoch": 0.7372325249643367,
"grad_norm": 4.15625,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.5597,
"mean_token_accuracy": 0.8421717286109924,
"num_tokens": 1892848.0,
"step": 323
},
{
"entropy": 1.4249206632375717,
"epoch": 0.7395149786019971,
"grad_norm": 3.59375,
"learning_rate": 3.741835578168071e-06,
"loss": 0.5289,
"mean_token_accuracy": 0.8406483083963394,
"num_tokens": 1899057.0,
"step": 324
},
{
"entropy": 1.4169852286577225,
"epoch": 0.7417974322396577,
"grad_norm": 3.421875,
"learning_rate": 3.7336535353086546e-06,
"loss": 0.4855,
"mean_token_accuracy": 0.8616788312792778,
"num_tokens": 1905042.0,
"step": 325
},
{
"entropy": 1.5189264565706253,
"epoch": 0.7440798858773181,
"grad_norm": 3.59375,
"learning_rate": 3.7254539875208577e-06,
"loss": 0.5092,
"mean_token_accuracy": 0.8563691675662994,
"num_tokens": 1910608.0,
"step": 326
},
{
"entropy": 1.4049191176891327,
"epoch": 0.7463623395149787,
"grad_norm": 3.40625,
"learning_rate": 3.717237051152175e-06,
"loss": 0.4253,
"mean_token_accuracy": 0.8755350038409233,
"num_tokens": 1916900.0,
"step": 327
},
{
"entropy": 1.4406355023384094,
"epoch": 0.7486447931526391,
"grad_norm": 3.3125,
"learning_rate": 3.7090028427968343e-06,
"loss": 0.5454,
"mean_token_accuracy": 0.8430257961153984,
"num_tokens": 1923487.0,
"step": 328
},
{
"entropy": 1.4147418439388275,
"epoch": 0.7509272467902995,
"grad_norm": 3.734375,
"learning_rate": 3.7007514792941462e-06,
"loss": 0.4328,
"mean_token_accuracy": 0.873896099627018,
"num_tokens": 1929126.0,
"step": 329
},
{
"entropy": 1.4407319128513336,
"epoch": 0.7532097004279601,
"grad_norm": 3.609375,
"learning_rate": 3.692483077726843e-06,
"loss": 0.4482,
"mean_token_accuracy": 0.8734828159213066,
"num_tokens": 1935299.0,
"step": 330
},
{
"entropy": 1.3825494647026062,
"epoch": 0.7554921540656205,
"grad_norm": 3.046875,
"learning_rate": 3.684197755419419e-06,
"loss": 0.3914,
"mean_token_accuracy": 0.8881862461566925,
"num_tokens": 1941583.0,
"step": 331
},
{
"entropy": 1.5679688155651093,
"epoch": 0.757774607703281,
"grad_norm": 3.5,
"learning_rate": 3.6758956299364643e-06,
"loss": 0.5205,
"mean_token_accuracy": 0.850575216114521,
"num_tokens": 1947719.0,
"step": 332
},
{
"entropy": 1.478807806968689,
"epoch": 0.7600570613409415,
"grad_norm": 3.953125,
"learning_rate": 3.6675768190810023e-06,
"loss": 0.5383,
"mean_token_accuracy": 0.8558880761265755,
"num_tokens": 1952792.0,
"step": 333
},
{
"entropy": 1.4567435085773468,
"epoch": 0.762339514978602,
"grad_norm": 3.625,
"learning_rate": 3.659241440892806e-06,
"loss": 0.4479,
"mean_token_accuracy": 0.8747463598847389,
"num_tokens": 1959114.0,
"step": 334
},
{
"entropy": 1.3806256204843521,
"epoch": 0.7646219686162625,
"grad_norm": 2.90625,
"learning_rate": 3.6508896136467376e-06,
"loss": 0.3259,
"mean_token_accuracy": 0.9004263803362846,
"num_tokens": 1965297.0,
"step": 335
},
{
"entropy": 1.3886072635650635,
"epoch": 0.766904422253923,
"grad_norm": 2.75,
"learning_rate": 3.642521455851058e-06,
"loss": 0.3218,
"mean_token_accuracy": 0.8972492516040802,
"num_tokens": 1972145.0,
"step": 336
},
{
"entropy": 1.4320484548807144,
"epoch": 0.7691868758915834,
"grad_norm": 3.671875,
"learning_rate": 3.634137086245754e-06,
"loss": 0.4502,
"mean_token_accuracy": 0.8562175408005714,
"num_tokens": 1977851.0,
"step": 337
},
{
"entropy": 1.606041207909584,
"epoch": 0.771469329529244,
"grad_norm": 3.921875,
"learning_rate": 3.625736623800849e-06,
"loss": 0.5698,
"mean_token_accuracy": 0.8275244310498238,
"num_tokens": 1983459.0,
"step": 338
},
{
"entropy": 1.396391972899437,
"epoch": 0.7737517831669044,
"grad_norm": 3.125,
"learning_rate": 3.6173201877147134e-06,
"loss": 0.4157,
"mean_token_accuracy": 0.8768060877919197,
"num_tokens": 1989443.0,
"step": 339
},
{
"entropy": 1.2999221831560135,
"epoch": 0.776034236804565,
"grad_norm": 2.65625,
"learning_rate": 3.6088878974123796e-06,
"loss": 0.3211,
"mean_token_accuracy": 0.9015626162290573,
"num_tokens": 1996081.0,
"step": 340
},
{
"entropy": 1.4438531249761581,
"epoch": 0.7783166904422254,
"grad_norm": 3.109375,
"learning_rate": 3.6004398725438406e-06,
"loss": 0.4224,
"mean_token_accuracy": 0.8693163841962814,
"num_tokens": 2002046.0,
"step": 341
},
{
"entropy": 1.5661528557538986,
"epoch": 0.7805991440798858,
"grad_norm": 3.8125,
"learning_rate": 3.5919762329823556e-06,
"loss": 0.4583,
"mean_token_accuracy": 0.8407174274325371,
"num_tokens": 2007992.0,
"step": 342
},
{
"entropy": 1.5423270612955093,
"epoch": 0.7828815977175464,
"grad_norm": 3.828125,
"learning_rate": 3.5834970988227484e-06,
"loss": 0.5046,
"mean_token_accuracy": 0.8615901097655296,
"num_tokens": 2013678.0,
"step": 343
},
{
"entropy": 1.3757345080375671,
"epoch": 0.7851640513552068,
"grad_norm": 3.078125,
"learning_rate": 3.5750025903797053e-06,
"loss": 0.435,
"mean_token_accuracy": 0.8637730702757835,
"num_tokens": 2019976.0,
"step": 344
},
{
"entropy": 1.5496114045381546,
"epoch": 0.7874465049928673,
"grad_norm": 4.03125,
"learning_rate": 3.566492828186063e-06,
"loss": 0.466,
"mean_token_accuracy": 0.861820325255394,
"num_tokens": 2025396.0,
"step": 345
},
{
"entropy": 1.4001742899417877,
"epoch": 0.7897289586305278,
"grad_norm": 3.265625,
"learning_rate": 3.5579679329911025e-06,
"loss": 0.4244,
"mean_token_accuracy": 0.8774027079343796,
"num_tokens": 2031341.0,
"step": 346
},
{
"entropy": 1.4246700257062912,
"epoch": 0.7920114122681883,
"grad_norm": 2.59375,
"learning_rate": 3.5494280257588367e-06,
"loss": 0.3573,
"mean_token_accuracy": 0.8994497805833817,
"num_tokens": 2038154.0,
"step": 347
},
{
"entropy": 1.4771685898303986,
"epoch": 0.7942938659058488,
"grad_norm": 3.5625,
"learning_rate": 3.5408732276662882e-06,
"loss": 0.4837,
"mean_token_accuracy": 0.8569220453500748,
"num_tokens": 2043977.0,
"step": 348
},
{
"entropy": 1.3758689016103745,
"epoch": 0.7965763195435093,
"grad_norm": 3.484375,
"learning_rate": 3.532303660101776e-06,
"loss": 0.4086,
"mean_token_accuracy": 0.8799771890044212,
"num_tokens": 2049581.0,
"step": 349
},
{
"entropy": 1.4391580671072006,
"epoch": 0.7988587731811697,
"grad_norm": 3.4375,
"learning_rate": 3.5237194446631883e-06,
"loss": 0.4414,
"mean_token_accuracy": 0.8686051443219185,
"num_tokens": 2054885.0,
"step": 350
},
{
"entropy": 1.572434514760971,
"epoch": 0.8011412268188303,
"grad_norm": 3.578125,
"learning_rate": 3.515120703156264e-06,
"loss": 0.4561,
"mean_token_accuracy": 0.869783990085125,
"num_tokens": 2060752.0,
"step": 351
},
{
"entropy": 1.3927340656518936,
"epoch": 0.8034236804564907,
"grad_norm": 3.25,
"learning_rate": 3.506507557592853e-06,
"loss": 0.3986,
"mean_token_accuracy": 0.8710938170552254,
"num_tokens": 2066701.0,
"step": 352
},
{
"entropy": 1.6066904217004776,
"epoch": 0.8057061340941513,
"grad_norm": 4.5,
"learning_rate": 3.4978801301891972e-06,
"loss": 0.5213,
"mean_token_accuracy": 0.8417335525155067,
"num_tokens": 2072037.0,
"step": 353
},
{
"entropy": 1.5368521958589554,
"epoch": 0.8079885877318117,
"grad_norm": 3.6875,
"learning_rate": 3.4892385433641875e-06,
"loss": 0.5679,
"mean_token_accuracy": 0.8372282758355141,
"num_tokens": 2077090.0,
"step": 354
},
{
"entropy": 1.4477348923683167,
"epoch": 0.8102710413694721,
"grad_norm": 3.140625,
"learning_rate": 3.480582919737631e-06,
"loss": 0.4322,
"mean_token_accuracy": 0.8827796950936317,
"num_tokens": 2083157.0,
"step": 355
},
{
"entropy": 1.449633464217186,
"epoch": 0.8125534950071327,
"grad_norm": 3.578125,
"learning_rate": 3.4719133821285108e-06,
"loss": 0.497,
"mean_token_accuracy": 0.8483736291527748,
"num_tokens": 2089047.0,
"step": 356
},
{
"entropy": 1.4000667333602905,
"epoch": 0.8148359486447931,
"grad_norm": 3.109375,
"learning_rate": 3.4632300535532415e-06,
"loss": 0.5416,
"mean_token_accuracy": 0.8374148234724998,
"num_tokens": 2095911.0,
"step": 357
},
{
"entropy": 1.5335423648357391,
"epoch": 0.8171184022824537,
"grad_norm": 3.703125,
"learning_rate": 3.4545330572239234e-06,
"loss": 0.4418,
"mean_token_accuracy": 0.8705498203635216,
"num_tokens": 2101062.0,
"step": 358
},
{
"entropy": 1.4877882897853851,
"epoch": 0.8194008559201141,
"grad_norm": 2.96875,
"learning_rate": 3.445822516546598e-06,
"loss": 0.382,
"mean_token_accuracy": 0.885826900601387,
"num_tokens": 2107503.0,
"step": 359
},
{
"entropy": 1.5615941286087036,
"epoch": 0.8216833095577746,
"grad_norm": 3.515625,
"learning_rate": 3.437098555119493e-06,
"loss": 0.4703,
"mean_token_accuracy": 0.8597147017717361,
"num_tokens": 2112957.0,
"step": 360
},
{
"entropy": 1.4338414072990417,
"epoch": 0.8239657631954351,
"grad_norm": 3.65625,
"learning_rate": 3.4283612967312692e-06,
"loss": 0.4431,
"mean_token_accuracy": 0.8747149705886841,
"num_tokens": 2119534.0,
"step": 361
},
{
"entropy": 1.3991961032152176,
"epoch": 0.8262482168330956,
"grad_norm": 2.8125,
"learning_rate": 3.4196108653592662e-06,
"loss": 0.3343,
"mean_token_accuracy": 0.9073175340890884,
"num_tokens": 2125905.0,
"step": 362
},
{
"entropy": 1.4029065370559692,
"epoch": 0.828530670470756,
"grad_norm": 3.265625,
"learning_rate": 3.4108473851677408e-06,
"loss": 0.3691,
"mean_token_accuracy": 0.8828721046447754,
"num_tokens": 2132517.0,
"step": 363
},
{
"entropy": 1.4478721916675568,
"epoch": 0.8308131241084166,
"grad_norm": 3.0,
"learning_rate": 3.4020709805061066e-06,
"loss": 0.399,
"mean_token_accuracy": 0.8760695457458496,
"num_tokens": 2138908.0,
"step": 364
},
{
"entropy": 1.470540538430214,
"epoch": 0.833095577746077,
"grad_norm": 3.59375,
"learning_rate": 3.3932817759071666e-06,
"loss": 0.4839,
"mean_token_accuracy": 0.8647991716861725,
"num_tokens": 2144936.0,
"step": 365
},
{
"entropy": 1.3821264803409576,
"epoch": 0.8353780313837376,
"grad_norm": 3.40625,
"learning_rate": 3.3844798960853533e-06,
"loss": 0.4712,
"mean_token_accuracy": 0.8681535720825195,
"num_tokens": 2151022.0,
"step": 366
},
{
"entropy": 1.4431174248456955,
"epoch": 0.837660485021398,
"grad_norm": 3.484375,
"learning_rate": 3.3756654659349487e-06,
"loss": 0.4008,
"mean_token_accuracy": 0.8728353902697563,
"num_tokens": 2156626.0,
"step": 367
},
{
"entropy": 1.3731088489294052,
"epoch": 0.8399429386590584,
"grad_norm": 3.40625,
"learning_rate": 3.3668386105283226e-06,
"loss": 0.4741,
"mean_token_accuracy": 0.863268293440342,
"num_tokens": 2163234.0,
"step": 368
},
{
"entropy": 1.4210239797830582,
"epoch": 0.842225392296719,
"grad_norm": 2.921875,
"learning_rate": 3.357999455114148e-06,
"loss": 0.4039,
"mean_token_accuracy": 0.8817742839455605,
"num_tokens": 2169749.0,
"step": 369
},
{
"entropy": 1.4794443249702454,
"epoch": 0.8445078459343794,
"grad_norm": 3.25,
"learning_rate": 3.3491481251156355e-06,
"loss": 0.4879,
"mean_token_accuracy": 0.8580229580402374,
"num_tokens": 2175776.0,
"step": 370
},
{
"entropy": 1.6413906067609787,
"epoch": 0.84679029957204,
"grad_norm": 4.1875,
"learning_rate": 3.34028474612874e-06,
"loss": 0.4411,
"mean_token_accuracy": 0.8557733818888664,
"num_tokens": 2180562.0,
"step": 371
},
{
"entropy": 1.410418540239334,
"epoch": 0.8490727532097004,
"grad_norm": 3.359375,
"learning_rate": 3.3314094439203903e-06,
"loss": 0.4152,
"mean_token_accuracy": 0.8825007230043411,
"num_tokens": 2185764.0,
"step": 372
},
{
"entropy": 1.479749009013176,
"epoch": 0.8513552068473609,
"grad_norm": 3.578125,
"learning_rate": 3.322522344426698e-06,
"loss": 0.4534,
"mean_token_accuracy": 0.8688785433769226,
"num_tokens": 2191225.0,
"step": 373
},
{
"entropy": 1.4503730237483978,
"epoch": 0.8536376604850214,
"grad_norm": 3.1875,
"learning_rate": 3.3136235737511715e-06,
"loss": 0.3714,
"mean_token_accuracy": 0.8881650194525719,
"num_tokens": 2196792.0,
"step": 374
},
{
"entropy": 1.3789267241954803,
"epoch": 0.8559201141226819,
"grad_norm": 2.953125,
"learning_rate": 3.3047132581629297e-06,
"loss": 0.398,
"mean_token_accuracy": 0.8848712220788002,
"num_tokens": 2203140.0,
"step": 375
},
{
"entropy": 1.4894972145557404,
"epoch": 0.8582025677603423,
"grad_norm": 3.203125,
"learning_rate": 3.295791524094906e-06,
"loss": 0.3865,
"mean_token_accuracy": 0.8710450083017349,
"num_tokens": 2209122.0,
"step": 376
},
{
"entropy": 1.3985904306173325,
"epoch": 0.8604850213980029,
"grad_norm": 2.875,
"learning_rate": 3.286858498142057e-06,
"loss": 0.4158,
"mean_token_accuracy": 0.878923624753952,
"num_tokens": 2215258.0,
"step": 377
},
{
"entropy": 1.548867017030716,
"epoch": 0.8627674750356633,
"grad_norm": 3.484375,
"learning_rate": 3.277914307059566e-06,
"loss": 0.5408,
"mean_token_accuracy": 0.8471002653241158,
"num_tokens": 2221371.0,
"step": 378
},
{
"entropy": 1.4772979021072388,
"epoch": 0.8650499286733239,
"grad_norm": 3.25,
"learning_rate": 3.2689590777610443e-06,
"loss": 0.3972,
"mean_token_accuracy": 0.8763172924518585,
"num_tokens": 2227158.0,
"step": 379
},
{
"entropy": 1.5023012608289719,
"epoch": 0.8673323823109843,
"grad_norm": 2.96875,
"learning_rate": 3.259992937316727e-06,
"loss": 0.4516,
"mean_token_accuracy": 0.8623324111104012,
"num_tokens": 2233629.0,
"step": 380
},
{
"entropy": 1.5667530596256256,
"epoch": 0.8696148359486447,
"grad_norm": 5.0625,
"learning_rate": 3.251016012951678e-06,
"loss": 0.6043,
"mean_token_accuracy": 0.8312884569168091,
"num_tokens": 2239082.0,
"step": 381
},
{
"entropy": 1.380866751074791,
"epoch": 0.8718972895863053,
"grad_norm": 3.140625,
"learning_rate": 3.242028432043974e-06,
"loss": 0.4196,
"mean_token_accuracy": 0.8756621181964874,
"num_tokens": 2245272.0,
"step": 382
},
{
"entropy": 1.4950210005044937,
"epoch": 0.8741797432239657,
"grad_norm": 3.265625,
"learning_rate": 3.2330303221229078e-06,
"loss": 0.4317,
"mean_token_accuracy": 0.8579834923148155,
"num_tokens": 2251010.0,
"step": 383
},
{
"entropy": 1.7085559666156769,
"epoch": 0.8764621968616263,
"grad_norm": 4.6875,
"learning_rate": 3.2240218108671683e-06,
"loss": 0.6511,
"mean_token_accuracy": 0.8028427958488464,
"num_tokens": 2256288.0,
"step": 384
},
{
"entropy": 1.579810380935669,
"epoch": 0.8787446504992867,
"grad_norm": 3.5,
"learning_rate": 3.2150030261030414e-06,
"loss": 0.4849,
"mean_token_accuracy": 0.8453002646565437,
"num_tokens": 2262186.0,
"step": 385
},
{
"entropy": 1.5028070509433746,
"epoch": 0.8810271041369472,
"grad_norm": 3.6875,
"learning_rate": 3.205974095802582e-06,
"loss": 0.5576,
"mean_token_accuracy": 0.8453918322920799,
"num_tokens": 2268003.0,
"step": 386
},
{
"entropy": 1.50083489716053,
"epoch": 0.8833095577746077,
"grad_norm": 3.859375,
"learning_rate": 3.196935148081808e-06,
"loss": 0.5821,
"mean_token_accuracy": 0.8238921985030174,
"num_tokens": 2273238.0,
"step": 387
},
{
"entropy": 1.460751935839653,
"epoch": 0.8855920114122682,
"grad_norm": 3.078125,
"learning_rate": 3.187886311198881e-06,
"loss": 0.463,
"mean_token_accuracy": 0.8708171024918556,
"num_tokens": 2279778.0,
"step": 388
},
{
"entropy": 1.3422992527484894,
"epoch": 0.8878744650499286,
"grad_norm": 3.28125,
"learning_rate": 3.178827713552281e-06,
"loss": 0.4008,
"mean_token_accuracy": 0.875513955950737,
"num_tokens": 2286016.0,
"step": 389
},
{
"entropy": 1.5027628540992737,
"epoch": 0.8901569186875892,
"grad_norm": 3.328125,
"learning_rate": 3.1697594836789924e-06,
"loss": 0.5086,
"mean_token_accuracy": 0.8417061790823936,
"num_tokens": 2291896.0,
"step": 390
},
{
"entropy": 1.5571343451738358,
"epoch": 0.8924393723252496,
"grad_norm": 4.53125,
"learning_rate": 3.160681750252674e-06,
"loss": 0.5863,
"mean_token_accuracy": 0.8346568569540977,
"num_tokens": 2296989.0,
"step": 391
},
{
"entropy": 1.4478174448013306,
"epoch": 0.8947218259629102,
"grad_norm": 3.265625,
"learning_rate": 3.1515946420818343e-06,
"loss": 0.4618,
"mean_token_accuracy": 0.8564577624201775,
"num_tokens": 2303240.0,
"step": 392
},
{
"entropy": 1.4417100101709366,
"epoch": 0.8970042796005706,
"grad_norm": 3.3125,
"learning_rate": 3.142498288108007e-06,
"loss": 0.5086,
"mean_token_accuracy": 0.8544816300272942,
"num_tokens": 2308819.0,
"step": 393
},
{
"entropy": 1.549110621213913,
"epoch": 0.899286733238231,
"grad_norm": 3.234375,
"learning_rate": 3.133392817403919e-06,
"loss": 0.4943,
"mean_token_accuracy": 0.8492691740393639,
"num_tokens": 2315199.0,
"step": 394
},
{
"entropy": 1.437395378947258,
"epoch": 0.9015691868758916,
"grad_norm": 3.265625,
"learning_rate": 3.124278359171657e-06,
"loss": 0.4162,
"mean_token_accuracy": 0.8790151923894882,
"num_tokens": 2321449.0,
"step": 395
},
{
"entropy": 1.4882071912288666,
"epoch": 0.903851640513552,
"grad_norm": 3.15625,
"learning_rate": 3.1151550427408383e-06,
"loss": 0.3974,
"mean_token_accuracy": 0.8646276146173477,
"num_tokens": 2327198.0,
"step": 396
},
{
"entropy": 1.414357990026474,
"epoch": 0.9061340941512126,
"grad_norm": 3.640625,
"learning_rate": 3.1060229975667716e-06,
"loss": 0.3884,
"mean_token_accuracy": 0.874775730073452,
"num_tokens": 2333184.0,
"step": 397
},
{
"entropy": 1.5017937868833542,
"epoch": 0.908416547788873,
"grad_norm": 3.09375,
"learning_rate": 3.0968823532286246e-06,
"loss": 0.4596,
"mean_token_accuracy": 0.8661977797746658,
"num_tokens": 2339353.0,
"step": 398
},
{
"entropy": 1.4912959188222885,
"epoch": 0.9106990014265335,
"grad_norm": 3.015625,
"learning_rate": 3.0877332394275806e-06,
"loss": 0.3845,
"mean_token_accuracy": 0.8872612118721008,
"num_tokens": 2345323.0,
"step": 399
},
{
"entropy": 1.5040694773197174,
"epoch": 0.912981455064194,
"grad_norm": 3.84375,
"learning_rate": 3.0785757859850025e-06,
"loss": 0.4793,
"mean_token_accuracy": 0.8584380373358727,
"num_tokens": 2350382.0,
"step": 400
},
{
"epoch": 0.912981455064194,
"eval_entropy": 1.4835859013928308,
"eval_loss": 0.47563549876213074,
"eval_mean_token_accuracy": 0.8651414997047848,
"eval_num_tokens": 2350382.0,
"eval_runtime": 4.4144,
"eval_samples_per_second": 20.388,
"eval_steps_per_second": 20.388,
"step": 400
},
{
"entropy": 1.4684801995754242,
"epoch": 0.9152639087018545,
"grad_norm": 3.765625,
"learning_rate": 3.069410122840585e-06,
"loss": 0.4838,
"mean_token_accuracy": 0.8577789217233658,
"num_tokens": 2356642.0,
"step": 401
},
{
"entropy": 1.4736972451210022,
"epoch": 0.917546362339515,
"grad_norm": 3.09375,
"learning_rate": 3.0602363800505198e-06,
"loss": 0.4626,
"mean_token_accuracy": 0.8666577711701393,
"num_tokens": 2363069.0,
"step": 402
},
{
"entropy": 1.4170372486114502,
"epoch": 0.9198288159771755,
"grad_norm": 2.96875,
"learning_rate": 3.05105468778564e-06,
"loss": 0.4183,
"mean_token_accuracy": 0.8878279328346252,
"num_tokens": 2369558.0,
"step": 403
},
{
"entropy": 1.2785319834947586,
"epoch": 0.9221112696148359,
"grad_norm": 3.0,
"learning_rate": 3.041865176329579e-06,
"loss": 0.383,
"mean_token_accuracy": 0.8874974772334099,
"num_tokens": 2376487.0,
"step": 404
},
{
"entropy": 1.5108132362365723,
"epoch": 0.9243937232524965,
"grad_norm": 3.796875,
"learning_rate": 3.032667976076923e-06,
"loss": 0.5087,
"mean_token_accuracy": 0.8496776968240738,
"num_tokens": 2382047.0,
"step": 405
},
{
"entropy": 1.4732455164194107,
"epoch": 0.9266761768901569,
"grad_norm": 2.84375,
"learning_rate": 3.0234632175313537e-06,
"loss": 0.3808,
"mean_token_accuracy": 0.8731858357787132,
"num_tokens": 2388697.0,
"step": 406
},
{
"entropy": 1.428204596042633,
"epoch": 0.9289586305278174,
"grad_norm": 2.96875,
"learning_rate": 3.0142510313038057e-06,
"loss": 0.3893,
"mean_token_accuracy": 0.8852085620164871,
"num_tokens": 2395175.0,
"step": 407
},
{
"entropy": 1.3948392271995544,
"epoch": 0.9312410841654779,
"grad_norm": 3.015625,
"learning_rate": 3.0050315481106074e-06,
"loss": 0.4367,
"mean_token_accuracy": 0.8680780380964279,
"num_tokens": 2401107.0,
"step": 408
},
{
"entropy": 1.4686945080757141,
"epoch": 0.9335235378031383,
"grad_norm": 3.078125,
"learning_rate": 2.9958048987716266e-06,
"loss": 0.4492,
"mean_token_accuracy": 0.8716259375214577,
"num_tokens": 2407315.0,
"step": 409
},
{
"entropy": 1.5125146508216858,
"epoch": 0.9358059914407989,
"grad_norm": 3.5625,
"learning_rate": 2.9865712142084145e-06,
"loss": 0.5313,
"mean_token_accuracy": 0.8568686470389366,
"num_tokens": 2413259.0,
"step": 410
},
{
"entropy": 1.433497592806816,
"epoch": 0.9380884450784593,
"grad_norm": 3.0,
"learning_rate": 2.977330625442352e-06,
"loss": 0.412,
"mean_token_accuracy": 0.8721762746572495,
"num_tokens": 2419468.0,
"step": 411
},
{
"entropy": 1.4551435112953186,
"epoch": 0.9403708987161198,
"grad_norm": 2.90625,
"learning_rate": 2.9680832635927824e-06,
"loss": 0.472,
"mean_token_accuracy": 0.8528627678751945,
"num_tokens": 2426271.0,
"step": 412
},
{
"entropy": 1.447442203760147,
"epoch": 0.9426533523537803,
"grad_norm": 2.71875,
"learning_rate": 2.95882925987516e-06,
"loss": 0.3598,
"mean_token_accuracy": 0.8820754066109657,
"num_tokens": 2432887.0,
"step": 413
},
{
"entropy": 1.5209446549415588,
"epoch": 0.9449358059914408,
"grad_norm": 3.46875,
"learning_rate": 2.949568745599182e-06,
"loss": 0.4893,
"mean_token_accuracy": 0.8616260290145874,
"num_tokens": 2438656.0,
"step": 414
},
{
"entropy": 1.4069498479366302,
"epoch": 0.9472182596291013,
"grad_norm": 3.46875,
"learning_rate": 2.9403018521669256e-06,
"loss": 0.5104,
"mean_token_accuracy": 0.8574993088841438,
"num_tokens": 2444704.0,
"step": 415
},
{
"entropy": 1.487932413816452,
"epoch": 0.9495007132667618,
"grad_norm": 3.21875,
"learning_rate": 2.9310287110709895e-06,
"loss": 0.4016,
"mean_token_accuracy": 0.8731286600232124,
"num_tokens": 2450361.0,
"step": 416
},
{
"entropy": 1.5046747326850891,
"epoch": 0.9517831669044222,
"grad_norm": 3.34375,
"learning_rate": 2.921749453892618e-06,
"loss": 0.4286,
"mean_token_accuracy": 0.8756372630596161,
"num_tokens": 2456532.0,
"step": 417
},
{
"entropy": 1.5569333881139755,
"epoch": 0.9540656205420828,
"grad_norm": 3.4375,
"learning_rate": 2.9124642122998453e-06,
"loss": 0.5047,
"mean_token_accuracy": 0.8422510251402855,
"num_tokens": 2462276.0,
"step": 418
},
{
"entropy": 1.477220967411995,
"epoch": 0.9563480741797432,
"grad_norm": 3.015625,
"learning_rate": 2.903173118045616e-06,
"loss": 0.4585,
"mean_token_accuracy": 0.8631913363933563,
"num_tokens": 2468621.0,
"step": 419
},
{
"entropy": 1.3926943019032478,
"epoch": 0.9586305278174037,
"grad_norm": 3.53125,
"learning_rate": 2.893876302965925e-06,
"loss": 0.4379,
"mean_token_accuracy": 0.8661207035183907,
"num_tokens": 2474234.0,
"step": 420
},
{
"entropy": 1.5482182949781418,
"epoch": 0.9609129814550642,
"grad_norm": 3.78125,
"learning_rate": 2.884573898977941e-06,
"loss": 0.507,
"mean_token_accuracy": 0.8496933579444885,
"num_tokens": 2479680.0,
"step": 421
},
{
"entropy": 1.360275536775589,
"epoch": 0.9631954350927246,
"grad_norm": 3.3125,
"learning_rate": 2.8752660380781367e-06,
"loss": 0.4307,
"mean_token_accuracy": 0.8788939565420151,
"num_tokens": 2485907.0,
"step": 422
},
{
"entropy": 1.3031716644763947,
"epoch": 0.9654778887303852,
"grad_norm": 2.875,
"learning_rate": 2.865952852340417e-06,
"loss": 0.3625,
"mean_token_accuracy": 0.8956428542733192,
"num_tokens": 2492467.0,
"step": 423
},
{
"entropy": 1.541382610797882,
"epoch": 0.9677603423680456,
"grad_norm": 3.8125,
"learning_rate": 2.856634473914242e-06,
"loss": 0.5266,
"mean_token_accuracy": 0.8559072092175484,
"num_tokens": 2498045.0,
"step": 424
},
{
"entropy": 1.4921831041574478,
"epoch": 0.9700427960057061,
"grad_norm": 3.28125,
"learning_rate": 2.8473110350227536e-06,
"loss": 0.3466,
"mean_token_accuracy": 0.8902567103505135,
"num_tokens": 2503553.0,
"step": 425
},
{
"entropy": 1.470309928059578,
"epoch": 0.9723252496433666,
"grad_norm": 3.375,
"learning_rate": 2.8379826679609e-06,
"loss": 0.4556,
"mean_token_accuracy": 0.8601387813687325,
"num_tokens": 2509707.0,
"step": 426
},
{
"entropy": 1.3546678721904755,
"epoch": 0.9746077032810271,
"grad_norm": 2.828125,
"learning_rate": 2.828649505093558e-06,
"loss": 0.3985,
"mean_token_accuracy": 0.8941172435879707,
"num_tokens": 2516288.0,
"step": 427
},
{
"entropy": 1.4447802305221558,
"epoch": 0.9768901569186876,
"grad_norm": 3.421875,
"learning_rate": 2.819311678853652e-06,
"loss": 0.4776,
"mean_token_accuracy": 0.8569598346948624,
"num_tokens": 2521956.0,
"step": 428
},
{
"entropy": 1.6203635483980179,
"epoch": 0.9791726105563481,
"grad_norm": 3.734375,
"learning_rate": 2.8099693217402807e-06,
"loss": 0.4593,
"mean_token_accuracy": 0.8529090061783791,
"num_tokens": 2526920.0,
"step": 429
},
{
"entropy": 1.473097711801529,
"epoch": 0.9814550641940085,
"grad_norm": 3.265625,
"learning_rate": 2.800622566316831e-06,
"loss": 0.5033,
"mean_token_accuracy": 0.8560734689235687,
"num_tokens": 2533504.0,
"step": 430
},
{
"entropy": 1.5207239985466003,
"epoch": 0.9837375178316691,
"grad_norm": 4.09375,
"learning_rate": 2.7912715452091014e-06,
"loss": 0.5041,
"mean_token_accuracy": 0.8554971441626549,
"num_tokens": 2538535.0,
"step": 431
},
{
"entropy": 1.5741059184074402,
"epoch": 0.9860199714693295,
"grad_norm": 4.0,
"learning_rate": 2.7819163911034175e-06,
"loss": 0.4511,
"mean_token_accuracy": 0.8700136467814445,
"num_tokens": 2543371.0,
"step": 432
},
{
"entropy": 1.3865297734737396,
"epoch": 0.9883024251069901,
"grad_norm": 3.4375,
"learning_rate": 2.77255723674475e-06,
"loss": 0.4648,
"mean_token_accuracy": 0.8642655313014984,
"num_tokens": 2549303.0,
"step": 433
},
{
"entropy": 1.484322428703308,
"epoch": 0.9905848787446505,
"grad_norm": 3.453125,
"learning_rate": 2.7631942149348313e-06,
"loss": 0.5178,
"mean_token_accuracy": 0.8604016155004501,
"num_tokens": 2554892.0,
"step": 434
},
{
"entropy": 1.4711394906044006,
"epoch": 0.992867332382311,
"grad_norm": 3.1875,
"learning_rate": 2.7538274585302707e-06,
"loss": 0.5105,
"mean_token_accuracy": 0.8574899211525917,
"num_tokens": 2561168.0,
"step": 435
},
{
"entropy": 1.4003391563892365,
"epoch": 0.9951497860199715,
"grad_norm": 2.890625,
"learning_rate": 2.74445710044067e-06,
"loss": 0.3995,
"mean_token_accuracy": 0.8786035105586052,
"num_tokens": 2567401.0,
"step": 436
},
{
"entropy": 1.4778650850057602,
"epoch": 0.997432239657632,
"grad_norm": 3.25,
"learning_rate": 2.735083273626738e-06,
"loss": 0.5094,
"mean_token_accuracy": 0.8610806316137314,
"num_tokens": 2573896.0,
"step": 437
},
{
"entropy": 1.5298404842615128,
"epoch": 0.9997146932952924,
"grad_norm": 3.765625,
"learning_rate": 2.7257061110984005e-06,
"loss": 0.5801,
"mean_token_accuracy": 0.8354984298348427,
"num_tokens": 2579575.0,
"step": 438
},
{
"entropy": 1.2647957801818848,
"epoch": 1.0,
"grad_norm": 7.5,
"learning_rate": 2.7163257459129184e-06,
"loss": 0.3378,
"mean_token_accuracy": 0.9111570119857788,
"num_tokens": 2580462.0,
"step": 439
},
{
"entropy": 1.5493428707122803,
"epoch": 1.0022824536376604,
"grad_norm": 3.53125,
"learning_rate": 2.7069423111729948e-06,
"loss": 0.482,
"mean_token_accuracy": 0.8536929711699486,
"num_tokens": 2586104.0,
"step": 440
},
{
"entropy": 1.6429398506879807,
"epoch": 1.0045649072753209,
"grad_norm": 3.765625,
"learning_rate": 2.6975559400248876e-06,
"loss": 0.5162,
"mean_token_accuracy": 0.8646445199847221,
"num_tokens": 2591601.0,
"step": 441
},
{
"entropy": 1.3536241203546524,
"epoch": 1.0068473609129815,
"grad_norm": 2.53125,
"learning_rate": 2.688166765656523e-06,
"loss": 0.3578,
"mean_token_accuracy": 0.8843531683087349,
"num_tokens": 2598127.0,
"step": 442
},
{
"entropy": 1.4669701904058456,
"epoch": 1.009129814550642,
"grad_norm": 3.921875,
"learning_rate": 2.6787749212956023e-06,
"loss": 0.5313,
"mean_token_accuracy": 0.8472650721669197,
"num_tokens": 2603447.0,
"step": 443
},
{
"entropy": 1.4554204195737839,
"epoch": 1.0114122681883024,
"grad_norm": 3.78125,
"learning_rate": 2.6693805402077123e-06,
"loss": 0.5817,
"mean_token_accuracy": 0.83076561242342,
"num_tokens": 2609040.0,
"step": 444
},
{
"entropy": 1.4986287206411362,
"epoch": 1.0136947218259629,
"grad_norm": 3.546875,
"learning_rate": 2.6599837556944353e-06,
"loss": 0.498,
"mean_token_accuracy": 0.8590250089764595,
"num_tokens": 2615545.0,
"step": 445
},
{
"entropy": 1.5251432359218597,
"epoch": 1.0159771754636233,
"grad_norm": 4.0,
"learning_rate": 2.6505847010914575e-06,
"loss": 0.633,
"mean_token_accuracy": 0.8183507323265076,
"num_tokens": 2621930.0,
"step": 446
},
{
"entropy": 1.4970913529396057,
"epoch": 1.018259629101284,
"grad_norm": 3.203125,
"learning_rate": 2.641183509766675e-06,
"loss": 0.3988,
"mean_token_accuracy": 0.8723035603761673,
"num_tokens": 2627761.0,
"step": 447
},
{
"entropy": 1.4567296206951141,
"epoch": 1.0205420827389444,
"grad_norm": 3.296875,
"learning_rate": 2.6317803151183053e-06,
"loss": 0.4201,
"mean_token_accuracy": 0.8818748518824577,
"num_tokens": 2633748.0,
"step": 448
},
{
"entropy": 1.4635232239961624,
"epoch": 1.0228245363766049,
"grad_norm": 3.109375,
"learning_rate": 2.6223752505729884e-06,
"loss": 0.452,
"mean_token_accuracy": 0.8645489439368248,
"num_tokens": 2639662.0,
"step": 449
},
{
"entropy": 1.4294497519731522,
"epoch": 1.0251069900142653,
"grad_norm": 3.28125,
"learning_rate": 2.6129684495839013e-06,
"loss": 0.5102,
"mean_token_accuracy": 0.8570954278111458,
"num_tokens": 2645946.0,
"step": 450
},
{
"entropy": 1.3900626301765442,
"epoch": 1.0273894436519257,
"grad_norm": 2.9375,
"learning_rate": 2.6035600456288573e-06,
"loss": 0.3859,
"mean_token_accuracy": 0.8834785372018814,
"num_tokens": 2652364.0,
"step": 451
},
{
"entropy": 1.4409504532814026,
"epoch": 1.0296718972895864,
"grad_norm": 3.1875,
"learning_rate": 2.594150172208417e-06,
"loss": 0.4641,
"mean_token_accuracy": 0.8652448132634163,
"num_tokens": 2658338.0,
"step": 452
},
{
"entropy": 1.5055885165929794,
"epoch": 1.0319543509272469,
"grad_norm": 3.625,
"learning_rate": 2.5847389628439905e-06,
"loss": 0.426,
"mean_token_accuracy": 0.8645097240805626,
"num_tokens": 2663620.0,
"step": 453
},
{
"entropy": 1.5077017843723297,
"epoch": 1.0342368045649073,
"grad_norm": 3.15625,
"learning_rate": 2.575326551075945e-06,
"loss": 0.4288,
"mean_token_accuracy": 0.8733096942305565,
"num_tokens": 2669362.0,
"step": 454
},
{
"entropy": 1.3824554234743118,
"epoch": 1.0365192582025677,
"grad_norm": 3.03125,
"learning_rate": 2.5659130704617092e-06,
"loss": 0.4209,
"mean_token_accuracy": 0.8664216324687004,
"num_tokens": 2675587.0,
"step": 455
},
{
"entropy": 1.4790180027484894,
"epoch": 1.0388017118402282,
"grad_norm": 3.09375,
"learning_rate": 2.5564986545738767e-06,
"loss": 0.3928,
"mean_token_accuracy": 0.8827410265803337,
"num_tokens": 2681742.0,
"step": 456
},
{
"entropy": 1.4870340526103973,
"epoch": 1.0410841654778886,
"grad_norm": 3.734375,
"learning_rate": 2.547083436998316e-06,
"loss": 0.3968,
"mean_token_accuracy": 0.8777871504426003,
"num_tokens": 2687070.0,
"step": 457
},
{
"entropy": 1.492873653769493,
"epoch": 1.0433666191155493,
"grad_norm": 3.375,
"learning_rate": 2.5376675513322665e-06,
"loss": 0.4273,
"mean_token_accuracy": 0.8743336573243141,
"num_tokens": 2693415.0,
"step": 458
},
{
"entropy": 1.5607992857694626,
"epoch": 1.0456490727532097,
"grad_norm": 4.0,
"learning_rate": 2.52825113118245e-06,
"loss": 0.5436,
"mean_token_accuracy": 0.8444447070360184,
"num_tokens": 2699241.0,
"step": 459
},
{
"entropy": 1.4991340637207031,
"epoch": 1.0479315263908702,
"grad_norm": 3.0,
"learning_rate": 2.5188343101631717e-06,
"loss": 0.4713,
"mean_token_accuracy": 0.8594570085406303,
"num_tokens": 2705629.0,
"step": 460
},
{
"entropy": 1.4429044276475906,
"epoch": 1.0502139800285306,
"grad_norm": 3.28125,
"learning_rate": 2.5094172218944276e-06,
"loss": 0.5136,
"mean_token_accuracy": 0.8507946282625198,
"num_tokens": 2711944.0,
"step": 461
},
{
"entropy": 1.5478469878435135,
"epoch": 1.052496433666191,
"grad_norm": 3.21875,
"learning_rate": 2.5e-06,
"loss": 0.4498,
"mean_token_accuracy": 0.8698392882943153,
"num_tokens": 2717870.0,
"step": 462
},
{
"entropy": 1.4724483042955399,
"epoch": 1.0547788873038517,
"grad_norm": 4.09375,
"learning_rate": 2.4905827781055733e-06,
"loss": 0.5091,
"mean_token_accuracy": 0.8364823833107948,
"num_tokens": 2722955.0,
"step": 463
},
{
"entropy": 1.4399842321872711,
"epoch": 1.0570613409415122,
"grad_norm": 2.96875,
"learning_rate": 2.4811656898368287e-06,
"loss": 0.4118,
"mean_token_accuracy": 0.8793508112430573,
"num_tokens": 2729267.0,
"step": 464
},
{
"entropy": 1.4447701424360275,
"epoch": 1.0593437945791726,
"grad_norm": 3.3125,
"learning_rate": 2.4717488688175513e-06,
"loss": 0.4089,
"mean_token_accuracy": 0.8816163316369057,
"num_tokens": 2735200.0,
"step": 465
},
{
"entropy": 1.507298544049263,
"epoch": 1.061626248216833,
"grad_norm": 3.71875,
"learning_rate": 2.4623324486677352e-06,
"loss": 0.5426,
"mean_token_accuracy": 0.8359150066971779,
"num_tokens": 2740627.0,
"step": 466
},
{
"entropy": 1.4749993681907654,
"epoch": 1.0639087018544935,
"grad_norm": 3.28125,
"learning_rate": 2.4529165630016855e-06,
"loss": 0.4186,
"mean_token_accuracy": 0.8762158378958702,
"num_tokens": 2745817.0,
"step": 467
},
{
"entropy": 1.5043630599975586,
"epoch": 1.0661911554921542,
"grad_norm": 3.25,
"learning_rate": 2.4435013454261246e-06,
"loss": 0.4691,
"mean_token_accuracy": 0.8595764860510826,
"num_tokens": 2752047.0,
"step": 468
},
{
"entropy": 1.464219182729721,
"epoch": 1.0684736091298146,
"grad_norm": 3.609375,
"learning_rate": 2.4340869295382924e-06,
"loss": 0.4847,
"mean_token_accuracy": 0.8647123128175735,
"num_tokens": 2758030.0,
"step": 469
},
{
"entropy": 1.5525110363960266,
"epoch": 1.070756062767475,
"grad_norm": 3.40625,
"learning_rate": 2.4246734489240554e-06,
"loss": 0.4389,
"mean_token_accuracy": 0.871659129858017,
"num_tokens": 2763739.0,
"step": 470
},
{
"entropy": 1.441315084695816,
"epoch": 1.0730385164051355,
"grad_norm": 3.09375,
"learning_rate": 2.4152610371560095e-06,
"loss": 0.4706,
"mean_token_accuracy": 0.8659368455410004,
"num_tokens": 2770144.0,
"step": 471
},
{
"entropy": 1.5431715548038483,
"epoch": 1.075320970042796,
"grad_norm": 3.671875,
"learning_rate": 2.4058498277915835e-06,
"loss": 0.5396,
"mean_token_accuracy": 0.8234963491559029,
"num_tokens": 2776060.0,
"step": 472
},
{
"entropy": 1.3775285333395004,
"epoch": 1.0776034236804566,
"grad_norm": 3.0625,
"learning_rate": 2.3964399543711427e-06,
"loss": 0.3289,
"mean_token_accuracy": 0.8977130725979805,
"num_tokens": 2782100.0,
"step": 473
},
{
"entropy": 1.424841582775116,
"epoch": 1.079885877318117,
"grad_norm": 3.125,
"learning_rate": 2.3870315504160995e-06,
"loss": 0.4425,
"mean_token_accuracy": 0.8671782091259956,
"num_tokens": 2787965.0,
"step": 474
},
{
"entropy": 1.4423463493585587,
"epoch": 1.0821683309557775,
"grad_norm": 2.953125,
"learning_rate": 2.377624749427012e-06,
"loss": 0.3595,
"mean_token_accuracy": 0.8889539316296577,
"num_tokens": 2794165.0,
"step": 475
},
{
"entropy": 1.4992396533489227,
"epoch": 1.084450784593438,
"grad_norm": 3.875,
"learning_rate": 2.3682196848816955e-06,
"loss": 0.4793,
"mean_token_accuracy": 0.8694660887122154,
"num_tokens": 2800010.0,
"step": 476
},
{
"entropy": 1.4096488505601883,
"epoch": 1.0867332382310984,
"grad_norm": 3.03125,
"learning_rate": 2.358816490233326e-06,
"loss": 0.3516,
"mean_token_accuracy": 0.8974229022860527,
"num_tokens": 2805889.0,
"step": 477
},
{
"entropy": 1.4805195033550262,
"epoch": 1.089015691868759,
"grad_norm": 3.34375,
"learning_rate": 2.3494152989085433e-06,
"loss": 0.5061,
"mean_token_accuracy": 0.8679251745343208,
"num_tokens": 2811684.0,
"step": 478
},
{
"entropy": 1.5036189705133438,
"epoch": 1.0912981455064195,
"grad_norm": 3.546875,
"learning_rate": 2.3400162443055655e-06,
"loss": 0.5221,
"mean_token_accuracy": 0.8420342952013016,
"num_tokens": 2817131.0,
"step": 479
},
{
"entropy": 1.594360738992691,
"epoch": 1.09358059914408,
"grad_norm": 4.0,
"learning_rate": 2.330619459792289e-06,
"loss": 0.5052,
"mean_token_accuracy": 0.8538608327507973,
"num_tokens": 2822205.0,
"step": 480
},
{
"entropy": 1.3911210894584656,
"epoch": 1.0958630527817403,
"grad_norm": 2.796875,
"learning_rate": 2.321225078704399e-06,
"loss": 0.3525,
"mean_token_accuracy": 0.8852925226092339,
"num_tokens": 2828146.0,
"step": 481
},
{
"entropy": 1.5996953547000885,
"epoch": 1.0981455064194008,
"grad_norm": 3.4375,
"learning_rate": 2.311833234343478e-06,
"loss": 0.4677,
"mean_token_accuracy": 0.8572832494974136,
"num_tokens": 2833879.0,
"step": 482
},
{
"entropy": 1.5117892771959305,
"epoch": 1.1004279600570612,
"grad_norm": 4.09375,
"learning_rate": 2.3024440599751132e-06,
"loss": 0.4467,
"mean_token_accuracy": 0.8582476228475571,
"num_tokens": 2839173.0,
"step": 483
},
{
"entropy": 1.433998242020607,
"epoch": 1.102710413694722,
"grad_norm": 2.90625,
"learning_rate": 2.293057688827007e-06,
"loss": 0.3942,
"mean_token_accuracy": 0.8847835510969162,
"num_tokens": 2845616.0,
"step": 484
},
{
"entropy": 1.5342581421136856,
"epoch": 1.1049928673323823,
"grad_norm": 3.078125,
"learning_rate": 2.283674254087082e-06,
"loss": 0.4659,
"mean_token_accuracy": 0.8615615218877792,
"num_tokens": 2851949.0,
"step": 485
},
{
"entropy": 1.5389353781938553,
"epoch": 1.1072753209700428,
"grad_norm": 3.421875,
"learning_rate": 2.274293888901599e-06,
"loss": 0.4388,
"mean_token_accuracy": 0.871217891573906,
"num_tokens": 2857358.0,
"step": 486
},
{
"entropy": 1.4772920906543732,
"epoch": 1.1095577746077032,
"grad_norm": 4.03125,
"learning_rate": 2.264916726373263e-06,
"loss": 0.5044,
"mean_token_accuracy": 0.8598240464925766,
"num_tokens": 2862299.0,
"step": 487
},
{
"entropy": 1.4805989265441895,
"epoch": 1.1118402282453639,
"grad_norm": 2.890625,
"learning_rate": 2.2555428995593303e-06,
"loss": 0.444,
"mean_token_accuracy": 0.8689677938818932,
"num_tokens": 2868820.0,
"step": 488
},
{
"entropy": 1.4840258061885834,
"epoch": 1.1141226818830243,
"grad_norm": 3.421875,
"learning_rate": 2.24617254146973e-06,
"loss": 0.4531,
"mean_token_accuracy": 0.8679408878087997,
"num_tokens": 2874968.0,
"step": 489
},
{
"entropy": 1.4381522238254547,
"epoch": 1.1164051355206848,
"grad_norm": 3.125,
"learning_rate": 2.23680578506517e-06,
"loss": 0.4115,
"mean_token_accuracy": 0.8769493475556374,
"num_tokens": 2880835.0,
"step": 490
},
{
"entropy": 1.4330200850963593,
"epoch": 1.1186875891583452,
"grad_norm": 2.90625,
"learning_rate": 2.2274427632552507e-06,
"loss": 0.4123,
"mean_token_accuracy": 0.8793010637164116,
"num_tokens": 2887529.0,
"step": 491
},
{
"entropy": 1.3696521073579788,
"epoch": 1.1209700427960057,
"grad_norm": 2.9375,
"learning_rate": 2.2180836088965833e-06,
"loss": 0.3384,
"mean_token_accuracy": 0.8860399350523949,
"num_tokens": 2893458.0,
"step": 492
},
{
"entropy": 1.4893521070480347,
"epoch": 1.123252496433666,
"grad_norm": 3.0,
"learning_rate": 2.208728454790899e-06,
"loss": 0.4691,
"mean_token_accuracy": 0.8572286292910576,
"num_tokens": 2899716.0,
"step": 493
},
{
"entropy": 1.3807679414749146,
"epoch": 1.1255349500713268,
"grad_norm": 3.015625,
"learning_rate": 2.1993774336831696e-06,
"loss": 0.4068,
"mean_token_accuracy": 0.8788377121090889,
"num_tokens": 2906271.0,
"step": 494
},
{
"entropy": 1.4945531785488129,
"epoch": 1.1278174037089872,
"grad_norm": 3.078125,
"learning_rate": 2.19003067825972e-06,
"loss": 0.4081,
"mean_token_accuracy": 0.8731363192200661,
"num_tokens": 2912348.0,
"step": 495
},
{
"entropy": 1.5495448559522629,
"epoch": 1.1300998573466476,
"grad_norm": 3.921875,
"learning_rate": 2.180688321146349e-06,
"loss": 0.601,
"mean_token_accuracy": 0.8166243210434914,
"num_tokens": 2918060.0,
"step": 496
},
{
"entropy": 1.5690300911664963,
"epoch": 1.132382310984308,
"grad_norm": 3.5,
"learning_rate": 2.1713504949064433e-06,
"loss": 0.4601,
"mean_token_accuracy": 0.85266974568367,
"num_tokens": 2923409.0,
"step": 497
},
{
"entropy": 1.3820966184139252,
"epoch": 1.1346647646219685,
"grad_norm": 2.703125,
"learning_rate": 2.1620173320391007e-06,
"loss": 0.2558,
"mean_token_accuracy": 0.9106499254703522,
"num_tokens": 2929722.0,
"step": 498
},
{
"entropy": 1.540186420083046,
"epoch": 1.1369472182596292,
"grad_norm": 3.21875,
"learning_rate": 2.1526889649772477e-06,
"loss": 0.4437,
"mean_token_accuracy": 0.8645635023713112,
"num_tokens": 2935812.0,
"step": 499
},
{
"entropy": 1.435683935880661,
"epoch": 1.1392296718972896,
"grad_norm": 3.234375,
"learning_rate": 2.143365526085759e-06,
"loss": 0.48,
"mean_token_accuracy": 0.8664367198944092,
"num_tokens": 2942222.0,
"step": 500
},
{
"epoch": 1.1392296718972896,
"eval_entropy": 1.4798295431666904,
"eval_loss": 0.4741344451904297,
"eval_mean_token_accuracy": 0.8666040844387478,
"eval_num_tokens": 2942222.0,
"eval_runtime": 4.4417,
"eval_samples_per_second": 20.262,
"eval_steps_per_second": 20.262,
"step": 500
},
{
"entropy": 1.4722786843776703,
"epoch": 1.14151212553495,
"grad_norm": 3.484375,
"learning_rate": 2.1340471476595836e-06,
"loss": 0.4947,
"mean_token_accuracy": 0.8604092225432396,
"num_tokens": 2947869.0,
"step": 501
},
{
"entropy": 1.5302625745534897,
"epoch": 1.1437945791726105,
"grad_norm": 3.765625,
"learning_rate": 2.124733961921864e-06,
"loss": 0.5213,
"mean_token_accuracy": 0.8443537354469299,
"num_tokens": 2953787.0,
"step": 502
},
{
"entropy": 1.482778623700142,
"epoch": 1.146077032810271,
"grad_norm": 4.0,
"learning_rate": 2.11542610102206e-06,
"loss": 0.5494,
"mean_token_accuracy": 0.8402880057692528,
"num_tokens": 2958803.0,
"step": 503
},
{
"entropy": 1.4486149698495865,
"epoch": 1.1483594864479316,
"grad_norm": 3.203125,
"learning_rate": 2.1061236970340756e-06,
"loss": 0.4747,
"mean_token_accuracy": 0.8640668168663979,
"num_tokens": 2965403.0,
"step": 504
},
{
"entropy": 1.4366931170225143,
"epoch": 1.150641940085592,
"grad_norm": 3.078125,
"learning_rate": 2.096826881954385e-06,
"loss": 0.4002,
"mean_token_accuracy": 0.869108684360981,
"num_tokens": 2971085.0,
"step": 505
},
{
"entropy": 1.4204550981521606,
"epoch": 1.1529243937232525,
"grad_norm": 2.890625,
"learning_rate": 2.0875357877001556e-06,
"loss": 0.3827,
"mean_token_accuracy": 0.8868636935949326,
"num_tokens": 2976577.0,
"step": 506
},
{
"entropy": 1.484930396080017,
"epoch": 1.155206847360913,
"grad_norm": 3.578125,
"learning_rate": 2.0782505461073822e-06,
"loss": 0.4416,
"mean_token_accuracy": 0.8644617721438408,
"num_tokens": 2981977.0,
"step": 507
},
{
"entropy": 1.5487978011369705,
"epoch": 1.1574893009985734,
"grad_norm": 3.359375,
"learning_rate": 2.0689712889290114e-06,
"loss": 0.4142,
"mean_token_accuracy": 0.8582484424114227,
"num_tokens": 2987315.0,
"step": 508
},
{
"entropy": 1.4167566150426865,
"epoch": 1.159771754636234,
"grad_norm": 3.046875,
"learning_rate": 2.059698147833075e-06,
"loss": 0.4121,
"mean_token_accuracy": 0.8841976970434189,
"num_tokens": 2993295.0,
"step": 509
},
{
"entropy": 1.3966283351182938,
"epoch": 1.1620542082738945,
"grad_norm": 3.1875,
"learning_rate": 2.0504312544008193e-06,
"loss": 0.4939,
"mean_token_accuracy": 0.8544362857937813,
"num_tokens": 2999720.0,
"step": 510
},
{
"entropy": 1.5113223046064377,
"epoch": 1.164336661911555,
"grad_norm": 3.578125,
"learning_rate": 2.0411707401248406e-06,
"loss": 0.4498,
"mean_token_accuracy": 0.8582001850008965,
"num_tokens": 3004838.0,
"step": 511
},
{
"entropy": 1.4698415398597717,
"epoch": 1.1666191155492154,
"grad_norm": 3.453125,
"learning_rate": 2.0319167364072184e-06,
"loss": 0.4023,
"mean_token_accuracy": 0.8724709004163742,
"num_tokens": 3010321.0,
"step": 512
},
{
"entropy": 1.5211911350488663,
"epoch": 1.1689015691868758,
"grad_norm": 4.71875,
"learning_rate": 2.0226693745576494e-06,
"loss": 0.5156,
"mean_token_accuracy": 0.8473959043622017,
"num_tokens": 3015170.0,
"step": 513
},
{
"entropy": 1.3687680065631866,
"epoch": 1.1711840228245363,
"grad_norm": 3.15625,
"learning_rate": 2.0134287857915864e-06,
"loss": 0.4614,
"mean_token_accuracy": 0.8563283011317253,
"num_tokens": 3021067.0,
"step": 514
},
{
"entropy": 1.4768206179141998,
"epoch": 1.173466476462197,
"grad_norm": 4.0625,
"learning_rate": 2.004195101228374e-06,
"loss": 0.5225,
"mean_token_accuracy": 0.8456647023558617,
"num_tokens": 3026317.0,
"step": 515
},
{
"entropy": 1.5133604854345322,
"epoch": 1.1757489300998574,
"grad_norm": 3.359375,
"learning_rate": 1.9949684518893926e-06,
"loss": 0.4637,
"mean_token_accuracy": 0.8587842807173729,
"num_tokens": 3032462.0,
"step": 516
},
{
"entropy": 1.5985838025808334,
"epoch": 1.1780313837375178,
"grad_norm": 3.5,
"learning_rate": 1.985748968696194e-06,
"loss": 0.4668,
"mean_token_accuracy": 0.8562392815947533,
"num_tokens": 3037823.0,
"step": 517
},
{
"entropy": 1.2920548766851425,
"epoch": 1.1803138373751783,
"grad_norm": 2.453125,
"learning_rate": 1.9765367824686467e-06,
"loss": 0.3451,
"mean_token_accuracy": 0.8893763497471809,
"num_tokens": 3044938.0,
"step": 518
},
{
"entropy": 1.5204766243696213,
"epoch": 1.182596291012839,
"grad_norm": 3.203125,
"learning_rate": 1.9673320239230783e-06,
"loss": 0.4753,
"mean_token_accuracy": 0.8598108664155006,
"num_tokens": 3051301.0,
"step": 519
},
{
"entropy": 1.445823684334755,
"epoch": 1.1848787446504994,
"grad_norm": 3.875,
"learning_rate": 1.9581348236704217e-06,
"loss": 0.4797,
"mean_token_accuracy": 0.8649851009249687,
"num_tokens": 3056991.0,
"step": 520
},
{
"entropy": 1.4846927672624588,
"epoch": 1.1871611982881598,
"grad_norm": 3.875,
"learning_rate": 1.9489453122143605e-06,
"loss": 0.5029,
"mean_token_accuracy": 0.8675966411828995,
"num_tokens": 3062974.0,
"step": 521
},
{
"entropy": 1.4466316848993301,
"epoch": 1.1894436519258202,
"grad_norm": 3.421875,
"learning_rate": 1.939763619949481e-06,
"loss": 0.4049,
"mean_token_accuracy": 0.8771371468901634,
"num_tokens": 3068426.0,
"step": 522
},
{
"entropy": 1.5384458899497986,
"epoch": 1.1917261055634807,
"grad_norm": 3.484375,
"learning_rate": 1.930589877159415e-06,
"loss": 0.454,
"mean_token_accuracy": 0.864221066236496,
"num_tokens": 3074213.0,
"step": 523
},
{
"entropy": 1.541999727487564,
"epoch": 1.1940085592011411,
"grad_norm": 2.984375,
"learning_rate": 1.9214242140149987e-06,
"loss": 0.3965,
"mean_token_accuracy": 0.874009445309639,
"num_tokens": 3080429.0,
"step": 524
},
{
"entropy": 1.4880231320858002,
"epoch": 1.1962910128388018,
"grad_norm": 4.1875,
"learning_rate": 1.9122667605724202e-06,
"loss": 0.5623,
"mean_token_accuracy": 0.8356714621186256,
"num_tokens": 3085713.0,
"step": 525
},
{
"entropy": 1.5347374975681305,
"epoch": 1.1985734664764622,
"grad_norm": 2.890625,
"learning_rate": 1.9031176467713763e-06,
"loss": 0.3592,
"mean_token_accuracy": 0.8790554702281952,
"num_tokens": 3092191.0,
"step": 526
},
{
"entropy": 1.4829518347978592,
"epoch": 1.2008559201141227,
"grad_norm": 3.078125,
"learning_rate": 1.8939770024332294e-06,
"loss": 0.3886,
"mean_token_accuracy": 0.882826641201973,
"num_tokens": 3098756.0,
"step": 527
},
{
"entropy": 1.4315824955701828,
"epoch": 1.2031383737517831,
"grad_norm": 3.203125,
"learning_rate": 1.884844957259163e-06,
"loss": 0.4995,
"mean_token_accuracy": 0.8524395078420639,
"num_tokens": 3104965.0,
"step": 528
},
{
"entropy": 1.4298695474863052,
"epoch": 1.2054208273894436,
"grad_norm": 2.8125,
"learning_rate": 1.875721640828344e-06,
"loss": 0.3871,
"mean_token_accuracy": 0.8858682960271835,
"num_tokens": 3111490.0,
"step": 529
},
{
"entropy": 1.4648047238588333,
"epoch": 1.2077032810271042,
"grad_norm": 2.859375,
"learning_rate": 1.866607182596081e-06,
"loss": 0.3277,
"mean_token_accuracy": 0.8968348726630211,
"num_tokens": 3117215.0,
"step": 530
},
{
"entropy": 1.5635619461536407,
"epoch": 1.2099857346647647,
"grad_norm": 3.46875,
"learning_rate": 1.857501711891993e-06,
"loss": 0.4185,
"mean_token_accuracy": 0.8711593821644783,
"num_tokens": 3123093.0,
"step": 531
},
{
"entropy": 1.4186049550771713,
"epoch": 1.212268188302425,
"grad_norm": 3.109375,
"learning_rate": 1.848405357918166e-06,
"loss": 0.4707,
"mean_token_accuracy": 0.8640479817986488,
"num_tokens": 3129377.0,
"step": 532
},
{
"entropy": 1.439442053437233,
"epoch": 1.2145506419400856,
"grad_norm": 3.046875,
"learning_rate": 1.8393182497473271e-06,
"loss": 0.3726,
"mean_token_accuracy": 0.8774393498897552,
"num_tokens": 3135006.0,
"step": 533
},
{
"entropy": 1.5040159970521927,
"epoch": 1.216833095577746,
"grad_norm": 3.734375,
"learning_rate": 1.830240516321008e-06,
"loss": 0.5652,
"mean_token_accuracy": 0.8349686115980148,
"num_tokens": 3140699.0,
"step": 534
},
{
"entropy": 1.5229474604129791,
"epoch": 1.2191155492154067,
"grad_norm": 3.109375,
"learning_rate": 1.8211722864477197e-06,
"loss": 0.4583,
"mean_token_accuracy": 0.8692138940095901,
"num_tokens": 3147116.0,
"step": 535
},
{
"entropy": 1.4866357445716858,
"epoch": 1.221398002853067,
"grad_norm": 3.390625,
"learning_rate": 1.8121136888011198e-06,
"loss": 0.5026,
"mean_token_accuracy": 0.8499261438846588,
"num_tokens": 3153155.0,
"step": 536
},
{
"entropy": 1.4372419267892838,
"epoch": 1.2236804564907275,
"grad_norm": 3.453125,
"learning_rate": 1.8030648519181926e-06,
"loss": 0.4709,
"mean_token_accuracy": 0.8507603630423546,
"num_tokens": 3158699.0,
"step": 537
},
{
"entropy": 1.4247355163097382,
"epoch": 1.225962910128388,
"grad_norm": 2.875,
"learning_rate": 1.7940259041974189e-06,
"loss": 0.4764,
"mean_token_accuracy": 0.8748277649283409,
"num_tokens": 3165422.0,
"step": 538
},
{
"entropy": 1.4786742329597473,
"epoch": 1.2282453637660484,
"grad_norm": 3.5625,
"learning_rate": 1.7849969738969592e-06,
"loss": 0.4736,
"mean_token_accuracy": 0.8629911243915558,
"num_tokens": 3171419.0,
"step": 539
},
{
"entropy": 1.5304382294416428,
"epoch": 1.230527817403709,
"grad_norm": 3.375,
"learning_rate": 1.7759781891328321e-06,
"loss": 0.494,
"mean_token_accuracy": 0.8473329395055771,
"num_tokens": 3177530.0,
"step": 540
},
{
"entropy": 1.3744118362665176,
"epoch": 1.2328102710413695,
"grad_norm": 2.96875,
"learning_rate": 1.766969677877094e-06,
"loss": 0.4123,
"mean_token_accuracy": 0.8834565728902817,
"num_tokens": 3184220.0,
"step": 541
},
{
"entropy": 1.3755813837051392,
"epoch": 1.23509272467903,
"grad_norm": 2.796875,
"learning_rate": 1.7579715679560273e-06,
"loss": 0.4265,
"mean_token_accuracy": 0.8768275752663612,
"num_tokens": 3190613.0,
"step": 542
},
{
"entropy": 1.4333829581737518,
"epoch": 1.2373751783166904,
"grad_norm": 3.484375,
"learning_rate": 1.7489839870483236e-06,
"loss": 0.4931,
"mean_token_accuracy": 0.8496510609984398,
"num_tokens": 3196269.0,
"step": 543
},
{
"entropy": 1.4510899037122726,
"epoch": 1.2396576319543509,
"grad_norm": 3.078125,
"learning_rate": 1.7400070626832732e-06,
"loss": 0.3757,
"mean_token_accuracy": 0.8865254819393158,
"num_tokens": 3201924.0,
"step": 544
},
{
"entropy": 1.4932819455862045,
"epoch": 1.2419400855920113,
"grad_norm": 3.484375,
"learning_rate": 1.7310409222389563e-06,
"loss": 0.4531,
"mean_token_accuracy": 0.850062184035778,
"num_tokens": 3207808.0,
"step": 545
},
{
"entropy": 1.5299255549907684,
"epoch": 1.244222539229672,
"grad_norm": 3.328125,
"learning_rate": 1.7220856929404342e-06,
"loss": 0.4531,
"mean_token_accuracy": 0.8687416762113571,
"num_tokens": 3213083.0,
"step": 546
},
{
"entropy": 1.5315914154052734,
"epoch": 1.2465049928673324,
"grad_norm": 3.84375,
"learning_rate": 1.713141501857943e-06,
"loss": 0.504,
"mean_token_accuracy": 0.850853443145752,
"num_tokens": 3218803.0,
"step": 547
},
{
"entropy": 1.5325356125831604,
"epoch": 1.2487874465049928,
"grad_norm": 3.578125,
"learning_rate": 1.7042084759050948e-06,
"loss": 0.495,
"mean_token_accuracy": 0.8577945232391357,
"num_tokens": 3224187.0,
"step": 548
},
{
"entropy": 1.3780454993247986,
"epoch": 1.2510699001426533,
"grad_norm": 3.140625,
"learning_rate": 1.6952867418370707e-06,
"loss": 0.4453,
"mean_token_accuracy": 0.8700388446450233,
"num_tokens": 3230589.0,
"step": 549
},
{
"entropy": 1.466676115989685,
"epoch": 1.253352353780314,
"grad_norm": 3.296875,
"learning_rate": 1.6863764262488292e-06,
"loss": 0.496,
"mean_token_accuracy": 0.8478997200727463,
"num_tokens": 3237256.0,
"step": 550
},
{
"entropy": 1.4295217841863632,
"epoch": 1.2556348074179744,
"grad_norm": 3.109375,
"learning_rate": 1.677477655573303e-06,
"loss": 0.4455,
"mean_token_accuracy": 0.8676532134413719,
"num_tokens": 3243578.0,
"step": 551
},
{
"entropy": 1.4432758837938309,
"epoch": 1.2579172610556348,
"grad_norm": 3.46875,
"learning_rate": 1.6685905560796101e-06,
"loss": 0.4933,
"mean_token_accuracy": 0.8503763899207115,
"num_tokens": 3249344.0,
"step": 552
},
{
"entropy": 1.4768379628658295,
"epoch": 1.2601997146932953,
"grad_norm": 3.578125,
"learning_rate": 1.6597152538712608e-06,
"loss": 0.5331,
"mean_token_accuracy": 0.8477922007441521,
"num_tokens": 3256038.0,
"step": 553
},
{
"entropy": 1.4157912582159042,
"epoch": 1.2624821683309557,
"grad_norm": 3.515625,
"learning_rate": 1.6508518748843651e-06,
"loss": 0.5013,
"mean_token_accuracy": 0.860062412917614,
"num_tokens": 3261703.0,
"step": 554
},
{
"entropy": 1.403880551457405,
"epoch": 1.2647646219686162,
"grad_norm": 3.234375,
"learning_rate": 1.6420005448858522e-06,
"loss": 0.5094,
"mean_token_accuracy": 0.8528245538473129,
"num_tokens": 3268063.0,
"step": 555
},
{
"entropy": 1.5064998269081116,
"epoch": 1.2670470756062768,
"grad_norm": 3.0625,
"learning_rate": 1.6331613894716787e-06,
"loss": 0.4452,
"mean_token_accuracy": 0.8757540956139565,
"num_tokens": 3274092.0,
"step": 556
},
{
"entropy": 1.4100589752197266,
"epoch": 1.2693295292439373,
"grad_norm": 3.203125,
"learning_rate": 1.6243345340650523e-06,
"loss": 0.4675,
"mean_token_accuracy": 0.8688594177365303,
"num_tokens": 3280661.0,
"step": 557
},
{
"entropy": 1.5577640682458878,
"epoch": 1.2716119828815977,
"grad_norm": 3.40625,
"learning_rate": 1.6155201039146478e-06,
"loss": 0.4195,
"mean_token_accuracy": 0.8589218854904175,
"num_tokens": 3286601.0,
"step": 558
},
{
"entropy": 1.3485192209482193,
"epoch": 1.2738944365192582,
"grad_norm": 2.890625,
"learning_rate": 1.6067182240928332e-06,
"loss": 0.3449,
"mean_token_accuracy": 0.8934107944369316,
"num_tokens": 3292073.0,
"step": 559
},
{
"entropy": 1.5532638430595398,
"epoch": 1.2761768901569188,
"grad_norm": 3.328125,
"learning_rate": 1.5979290194938938e-06,
"loss": 0.4331,
"mean_token_accuracy": 0.8702542334794998,
"num_tokens": 3298200.0,
"step": 560
},
{
"entropy": 1.5261798650026321,
"epoch": 1.2784593437945793,
"grad_norm": 3.109375,
"learning_rate": 1.5891526148322594e-06,
"loss": 0.4389,
"mean_token_accuracy": 0.862305723130703,
"num_tokens": 3304356.0,
"step": 561
},
{
"entropy": 1.536175400018692,
"epoch": 1.2807417974322397,
"grad_norm": 4.3125,
"learning_rate": 1.5803891346407342e-06,
"loss": 0.5677,
"mean_token_accuracy": 0.8316505700349808,
"num_tokens": 3309722.0,
"step": 562
},
{
"entropy": 1.4453733563423157,
"epoch": 1.2830242510699001,
"grad_norm": 3.1875,
"learning_rate": 1.5716387032687314e-06,
"loss": 0.3941,
"mean_token_accuracy": 0.8798687309026718,
"num_tokens": 3315076.0,
"step": 563
},
{
"entropy": 1.5081749856472015,
"epoch": 1.2853067047075606,
"grad_norm": 2.96875,
"learning_rate": 1.562901444880508e-06,
"loss": 0.4143,
"mean_token_accuracy": 0.8727659210562706,
"num_tokens": 3320848.0,
"step": 564
},
{
"entropy": 1.5081788897514343,
"epoch": 1.287589158345221,
"grad_norm": 3.1875,
"learning_rate": 1.5541774834534024e-06,
"loss": 0.4623,
"mean_token_accuracy": 0.8562600538134575,
"num_tokens": 3327236.0,
"step": 565
},
{
"entropy": 1.4714922159910202,
"epoch": 1.2898716119828815,
"grad_norm": 3.46875,
"learning_rate": 1.5454669427760774e-06,
"loss": 0.4112,
"mean_token_accuracy": 0.8714669123291969,
"num_tokens": 3333039.0,
"step": 566
},
{
"entropy": 1.496582642197609,
"epoch": 1.2921540656205421,
"grad_norm": 3.328125,
"learning_rate": 1.5367699464467596e-06,
"loss": 0.4667,
"mean_token_accuracy": 0.8694412559270859,
"num_tokens": 3339578.0,
"step": 567
},
{
"entropy": 1.453754335641861,
"epoch": 1.2944365192582026,
"grad_norm": 3.234375,
"learning_rate": 1.5280866178714898e-06,
"loss": 0.4655,
"mean_token_accuracy": 0.8703877553343773,
"num_tokens": 3346073.0,
"step": 568
},
{
"entropy": 1.496316447854042,
"epoch": 1.296718972895863,
"grad_norm": 3.3125,
"learning_rate": 1.5194170802623692e-06,
"loss": 0.403,
"mean_token_accuracy": 0.8825008124113083,
"num_tokens": 3351735.0,
"step": 569
},
{
"entropy": 1.5532702058553696,
"epoch": 1.2990014265335235,
"grad_norm": 3.375,
"learning_rate": 1.5107614566358136e-06,
"loss": 0.5159,
"mean_token_accuracy": 0.872811533510685,
"num_tokens": 3358008.0,
"step": 570
},
{
"entropy": 1.3984228074550629,
"epoch": 1.3012838801711841,
"grad_norm": 2.90625,
"learning_rate": 1.5021198698108038e-06,
"loss": 0.4531,
"mean_token_accuracy": 0.8692669570446014,
"num_tokens": 3364752.0,
"step": 571
},
{
"entropy": 1.500732660293579,
"epoch": 1.3035663338088446,
"grad_norm": 3.28125,
"learning_rate": 1.4934924424071479e-06,
"loss": 0.3973,
"mean_token_accuracy": 0.8750224709510803,
"num_tokens": 3369908.0,
"step": 572
},
{
"entropy": 1.4046034514904022,
"epoch": 1.305848787446505,
"grad_norm": 2.984375,
"learning_rate": 1.4848792968437376e-06,
"loss": 0.407,
"mean_token_accuracy": 0.8775566592812538,
"num_tokens": 3376101.0,
"step": 573
},
{
"entropy": 1.4599164128303528,
"epoch": 1.3081312410841655,
"grad_norm": 3.15625,
"learning_rate": 1.4762805553368115e-06,
"loss": 0.4068,
"mean_token_accuracy": 0.8896359950304031,
"num_tokens": 3381766.0,
"step": 574
},
{
"entropy": 1.5650553405284882,
"epoch": 1.310413694721826,
"grad_norm": 3.90625,
"learning_rate": 1.4676963398982248e-06,
"loss": 0.526,
"mean_token_accuracy": 0.8529334291815758,
"num_tokens": 3387045.0,
"step": 575
},
{
"entropy": 1.4292816668748856,
"epoch": 1.3126961483594863,
"grad_norm": 3.1875,
"learning_rate": 1.4591267723337122e-06,
"loss": 0.4427,
"mean_token_accuracy": 0.8748316466808319,
"num_tokens": 3393002.0,
"step": 576
},
{
"entropy": 1.5142599791288376,
"epoch": 1.314978601997147,
"grad_norm": 3.03125,
"learning_rate": 1.4505719742411644e-06,
"loss": 0.3505,
"mean_token_accuracy": 0.8907722160220146,
"num_tokens": 3398389.0,
"step": 577
},
{
"entropy": 1.3882330507040024,
"epoch": 1.3172610556348074,
"grad_norm": 2.734375,
"learning_rate": 1.4420320670088977e-06,
"loss": 0.3516,
"mean_token_accuracy": 0.891185887157917,
"num_tokens": 3404815.0,
"step": 578
},
{
"entropy": 1.5874699354171753,
"epoch": 1.3195435092724679,
"grad_norm": 3.6875,
"learning_rate": 1.4335071718139379e-06,
"loss": 0.5036,
"mean_token_accuracy": 0.8607900366187096,
"num_tokens": 3410299.0,
"step": 579
},
{
"entropy": 1.5547137558460236,
"epoch": 1.3218259629101283,
"grad_norm": 3.578125,
"learning_rate": 1.424997409620295e-06,
"loss": 0.4533,
"mean_token_accuracy": 0.8668412491679192,
"num_tokens": 3415403.0,
"step": 580
},
{
"entropy": 1.3236225843429565,
"epoch": 1.324108416547789,
"grad_norm": 3.078125,
"learning_rate": 1.4165029011772513e-06,
"loss": 0.4062,
"mean_token_accuracy": 0.8871461227536201,
"num_tokens": 3421683.0,
"step": 581
},
{
"entropy": 1.469793826341629,
"epoch": 1.3263908701854494,
"grad_norm": 2.875,
"learning_rate": 1.4080237670176456e-06,
"loss": 0.4243,
"mean_token_accuracy": 0.8801388815045357,
"num_tokens": 3427994.0,
"step": 582
},
{
"entropy": 1.4647793471813202,
"epoch": 1.3286733238231099,
"grad_norm": 3.09375,
"learning_rate": 1.3995601274561605e-06,
"loss": 0.4262,
"mean_token_accuracy": 0.8648821488022804,
"num_tokens": 3434912.0,
"step": 583
},
{
"entropy": 1.4584257155656815,
"epoch": 1.3309557774607703,
"grad_norm": 3.171875,
"learning_rate": 1.3911121025876212e-06,
"loss": 0.4423,
"mean_token_accuracy": 0.8798868283629417,
"num_tokens": 3442058.0,
"step": 584
},
{
"entropy": 1.5300581902265549,
"epoch": 1.3332382310984308,
"grad_norm": 3.21875,
"learning_rate": 1.382679812285287e-06,
"loss": 0.4313,
"mean_token_accuracy": 0.8496553376317024,
"num_tokens": 3447771.0,
"step": 585
},
{
"entropy": 1.5843760669231415,
"epoch": 1.3355206847360912,
"grad_norm": 4.0625,
"learning_rate": 1.3742633761991519e-06,
"loss": 0.4945,
"mean_token_accuracy": 0.8482984900474548,
"num_tokens": 3452785.0,
"step": 586
},
{
"entropy": 1.4104232043027878,
"epoch": 1.3378031383737519,
"grad_norm": 2.984375,
"learning_rate": 1.365862913754247e-06,
"loss": 0.3925,
"mean_token_accuracy": 0.8749718070030212,
"num_tokens": 3458611.0,
"step": 587
},
{
"entropy": 1.5725494027137756,
"epoch": 1.3400855920114123,
"grad_norm": 3.5,
"learning_rate": 1.357478544148943e-06,
"loss": 0.4045,
"mean_token_accuracy": 0.8671303018927574,
"num_tokens": 3465091.0,
"step": 588
},
{
"entropy": 1.4776265919208527,
"epoch": 1.3423680456490727,
"grad_norm": 3.015625,
"learning_rate": 1.3491103863532626e-06,
"loss": 0.3392,
"mean_token_accuracy": 0.9015164896845818,
"num_tokens": 3470488.0,
"step": 589
},
{
"entropy": 1.6683387607336044,
"epoch": 1.3446504992867332,
"grad_norm": 4.8125,
"learning_rate": 1.3407585591071944e-06,
"loss": 0.5101,
"mean_token_accuracy": 0.846831701695919,
"num_tokens": 3475407.0,
"step": 590
},
{
"entropy": 1.5126305967569351,
"epoch": 1.3469329529243939,
"grad_norm": 3.3125,
"learning_rate": 1.3324231809189985e-06,
"loss": 0.4343,
"mean_token_accuracy": 0.8680194914340973,
"num_tokens": 3481469.0,
"step": 591
},
{
"entropy": 1.5650955736637115,
"epoch": 1.3492154065620543,
"grad_norm": 3.515625,
"learning_rate": 1.3241043700635352e-06,
"loss": 0.4892,
"mean_token_accuracy": 0.86560869961977,
"num_tokens": 3487280.0,
"step": 592
},
{
"entropy": 1.5224829465150833,
"epoch": 1.3514978601997147,
"grad_norm": 3.5625,
"learning_rate": 1.3158022445805816e-06,
"loss": 0.437,
"mean_token_accuracy": 0.8517628982663155,
"num_tokens": 3492699.0,
"step": 593
},
{
"entropy": 1.5090250372886658,
"epoch": 1.3537803138373752,
"grad_norm": 3.734375,
"learning_rate": 1.3075169222731573e-06,
"loss": 0.4919,
"mean_token_accuracy": 0.8590176850557327,
"num_tokens": 3498075.0,
"step": 594
},
{
"entropy": 1.3978570252656937,
"epoch": 1.3560627674750356,
"grad_norm": 3.375,
"learning_rate": 1.2992485207058548e-06,
"loss": 0.4248,
"mean_token_accuracy": 0.8699210062623024,
"num_tokens": 3503380.0,
"step": 595
},
{
"entropy": 1.4768076539039612,
"epoch": 1.358345221112696,
"grad_norm": 3.5625,
"learning_rate": 1.2909971572031663e-06,
"loss": 0.4681,
"mean_token_accuracy": 0.8609839826822281,
"num_tokens": 3509109.0,
"step": 596
},
{
"entropy": 1.4522972255945206,
"epoch": 1.3606276747503565,
"grad_norm": 3.296875,
"learning_rate": 1.2827629488478254e-06,
"loss": 0.5161,
"mean_token_accuracy": 0.8707276359200478,
"num_tokens": 3515057.0,
"step": 597
},
{
"entropy": 1.57838936150074,
"epoch": 1.3629101283880172,
"grad_norm": 3.484375,
"learning_rate": 1.2745460124791425e-06,
"loss": 0.4295,
"mean_token_accuracy": 0.8608080074191093,
"num_tokens": 3520795.0,
"step": 598
},
{
"entropy": 1.4834775626659393,
"epoch": 1.3651925820256776,
"grad_norm": 3.1875,
"learning_rate": 1.266346464691346e-06,
"loss": 0.4126,
"mean_token_accuracy": 0.8710288777947426,
"num_tokens": 3526380.0,
"step": 599
},
{
"entropy": 1.5034915506839752,
"epoch": 1.367475035663338,
"grad_norm": 3.484375,
"learning_rate": 1.25816442183193e-06,
"loss": 0.5211,
"mean_token_accuracy": 0.837138943374157,
"num_tokens": 3531865.0,
"step": 600
},
{
"epoch": 1.367475035663338,
"eval_entropy": 1.4793427891201443,
"eval_loss": 0.473636656999588,
"eval_mean_token_accuracy": 0.8656807369656033,
"eval_num_tokens": 3531865.0,
"eval_runtime": 4.3898,
"eval_samples_per_second": 20.502,
"eval_steps_per_second": 20.502,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 878,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.360105773011712e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}