{ "best_global_step": 600, "best_metric": 0.473636656999588, "best_model_checkpoint": "./liquidaps-clean-large/checkpoint-600", "epoch": 1.367475035663338, "eval_steps": 100, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.3900936841964722, "epoch": 0.002282453637660485, "grad_norm": 13.0, "learning_rate": 0.0, "loss": 0.8776, "mean_token_accuracy": 0.7829889133572578, "num_tokens": 5919.0, "step": 1 }, { "entropy": 1.4252997040748596, "epoch": 0.00456490727532097, "grad_norm": 12.875, "learning_rate": 1.1363636363636364e-07, "loss": 0.6708, "mean_token_accuracy": 0.8342809975147247, "num_tokens": 11950.0, "step": 2 }, { "entropy": 1.398602306842804, "epoch": 0.0068473609129814554, "grad_norm": 13.375, "learning_rate": 2.2727272727272729e-07, "loss": 0.817, "mean_token_accuracy": 0.7826605513691902, "num_tokens": 17559.0, "step": 3 }, { "entropy": 1.3683724850416183, "epoch": 0.00912981455064194, "grad_norm": 13.875, "learning_rate": 3.409090909090909e-07, "loss": 0.8089, "mean_token_accuracy": 0.8110606968402863, "num_tokens": 23355.0, "step": 4 }, { "entropy": 1.6440566033124924, "epoch": 0.011412268188302425, "grad_norm": 16.5, "learning_rate": 4.5454545454545457e-07, "loss": 1.0826, "mean_token_accuracy": 0.7466800287365913, "num_tokens": 28342.0, "step": 5 }, { "entropy": 1.2425581067800522, "epoch": 0.013694721825962911, "grad_norm": 14.0625, "learning_rate": 5.681818181818182e-07, "loss": 0.8384, "mean_token_accuracy": 0.8118839636445045, "num_tokens": 33937.0, "step": 6 }, { "entropy": 1.494078889489174, "epoch": 0.015977175463623396, "grad_norm": 14.9375, "learning_rate": 6.818181818181818e-07, "loss": 0.8747, "mean_token_accuracy": 0.800664909183979, "num_tokens": 39724.0, "step": 7 }, { "entropy": 1.3064402341842651, "epoch": 0.01825962910128388, "grad_norm": 12.0, "learning_rate": 7.954545454545455e-07, "loss": 0.8043, "mean_token_accuracy": 0.8063121438026428, "num_tokens": 46054.0, "step": 8 }, { "entropy": 1.507575884461403, "epoch": 0.020542082738944364, "grad_norm": 17.25, "learning_rate": 9.090909090909091e-07, "loss": 1.0366, "mean_token_accuracy": 0.7458265796303749, "num_tokens": 50806.0, "step": 9 }, { "entropy": 1.3228261321783066, "epoch": 0.02282453637660485, "grad_norm": 13.0, "learning_rate": 1.0227272727272729e-06, "loss": 0.6629, "mean_token_accuracy": 0.8548868969082832, "num_tokens": 56696.0, "step": 10 }, { "entropy": 1.3493094593286514, "epoch": 0.025106990014265335, "grad_norm": 10.9375, "learning_rate": 1.1363636363636364e-06, "loss": 0.7411, "mean_token_accuracy": 0.8017316684126854, "num_tokens": 63680.0, "step": 11 }, { "entropy": 1.3807552456855774, "epoch": 0.027389443651925822, "grad_norm": 12.9375, "learning_rate": 1.25e-06, "loss": 0.8135, "mean_token_accuracy": 0.7994487285614014, "num_tokens": 69861.0, "step": 12 }, { "entropy": 1.4055243730545044, "epoch": 0.029671897289586305, "grad_norm": 11.9375, "learning_rate": 1.3636363636363636e-06, "loss": 0.9012, "mean_token_accuracy": 0.7958070710301399, "num_tokens": 75989.0, "step": 13 }, { "entropy": 1.431694433093071, "epoch": 0.03195435092724679, "grad_norm": 13.75, "learning_rate": 1.4772727272727275e-06, "loss": 0.9413, "mean_token_accuracy": 0.7656892687082291, "num_tokens": 81844.0, "step": 14 }, { "entropy": 1.5010923892259598, "epoch": 0.034236804564907276, "grad_norm": 15.3125, "learning_rate": 1.590909090909091e-06, "loss": 1.0155, "mean_token_accuracy": 0.7734663560986519, "num_tokens": 86897.0, "step": 15 }, { "entropy": 1.4839733690023422, "epoch": 0.03651925820256776, "grad_norm": 12.9375, "learning_rate": 1.7045454545454546e-06, "loss": 0.8776, "mean_token_accuracy": 0.7831285521388054, "num_tokens": 92714.0, "step": 16 }, { "entropy": 1.3343003541231155, "epoch": 0.038801711840228244, "grad_norm": 9.375, "learning_rate": 1.8181818181818183e-06, "loss": 0.7208, "mean_token_accuracy": 0.8181507587432861, "num_tokens": 100046.0, "step": 17 }, { "entropy": 1.488086387515068, "epoch": 0.04108416547788873, "grad_norm": 12.125, "learning_rate": 1.931818181818182e-06, "loss": 0.7636, "mean_token_accuracy": 0.7991937696933746, "num_tokens": 105549.0, "step": 18 }, { "entropy": 1.3153499066829681, "epoch": 0.04336661911554922, "grad_norm": 11.375, "learning_rate": 2.0454545454545457e-06, "loss": 0.7598, "mean_token_accuracy": 0.8102546408772469, "num_tokens": 111552.0, "step": 19 }, { "entropy": 1.3515659272670746, "epoch": 0.0456490727532097, "grad_norm": 11.375, "learning_rate": 2.1590909090909092e-06, "loss": 0.7113, "mean_token_accuracy": 0.810497097671032, "num_tokens": 117303.0, "step": 20 }, { "entropy": 1.4470301866531372, "epoch": 0.047931526390870186, "grad_norm": 11.125, "learning_rate": 2.2727272727272728e-06, "loss": 0.8029, "mean_token_accuracy": 0.7923144474625587, "num_tokens": 123355.0, "step": 21 }, { "entropy": 1.3571707159280777, "epoch": 0.05021398002853067, "grad_norm": 9.4375, "learning_rate": 2.3863636363636367e-06, "loss": 0.6621, "mean_token_accuracy": 0.8315573260188103, "num_tokens": 129801.0, "step": 22 }, { "entropy": 1.4135605692863464, "epoch": 0.05249643366619115, "grad_norm": 10.875, "learning_rate": 2.5e-06, "loss": 0.7478, "mean_token_accuracy": 0.8041789308190346, "num_tokens": 135168.0, "step": 23 }, { "entropy": 1.4300416111946106, "epoch": 0.054778887303851644, "grad_norm": 10.0625, "learning_rate": 2.6136363636363637e-06, "loss": 0.7541, "mean_token_accuracy": 0.8075885996222496, "num_tokens": 141202.0, "step": 24 }, { "entropy": 1.3513601571321487, "epoch": 0.05706134094151213, "grad_norm": 9.25, "learning_rate": 2.7272727272727272e-06, "loss": 0.6913, "mean_token_accuracy": 0.8184778317809105, "num_tokens": 147326.0, "step": 25 }, { "entropy": 1.3810700178146362, "epoch": 0.05934379457917261, "grad_norm": 9.75, "learning_rate": 2.8409090909090916e-06, "loss": 0.6849, "mean_token_accuracy": 0.8293009474873543, "num_tokens": 153439.0, "step": 26 }, { "entropy": 1.3730244934558868, "epoch": 0.061626248216833095, "grad_norm": 9.0625, "learning_rate": 2.954545454545455e-06, "loss": 0.6562, "mean_token_accuracy": 0.8283357098698616, "num_tokens": 159411.0, "step": 27 }, { "entropy": 1.337988331913948, "epoch": 0.06390870185449359, "grad_norm": 8.375, "learning_rate": 3.0681818181818186e-06, "loss": 0.5966, "mean_token_accuracy": 0.837442196905613, "num_tokens": 165669.0, "step": 28 }, { "entropy": 1.4772655963897705, "epoch": 0.06619115549215407, "grad_norm": 9.6875, "learning_rate": 3.181818181818182e-06, "loss": 0.7038, "mean_token_accuracy": 0.8186220824718475, "num_tokens": 170944.0, "step": 29 }, { "entropy": 1.3892450034618378, "epoch": 0.06847360912981455, "grad_norm": 7.8125, "learning_rate": 3.2954545454545456e-06, "loss": 0.658, "mean_token_accuracy": 0.8269658461213112, "num_tokens": 176755.0, "step": 30 }, { "entropy": 1.490507110953331, "epoch": 0.07075606276747504, "grad_norm": 8.375, "learning_rate": 3.409090909090909e-06, "loss": 0.7584, "mean_token_accuracy": 0.7987356930971146, "num_tokens": 182319.0, "step": 31 }, { "entropy": 1.3267859369516373, "epoch": 0.07303851640513552, "grad_norm": 7.15625, "learning_rate": 3.522727272727273e-06, "loss": 0.6272, "mean_token_accuracy": 0.8291826993227005, "num_tokens": 188236.0, "step": 32 }, { "entropy": 1.4844342470169067, "epoch": 0.075320970042796, "grad_norm": 7.53125, "learning_rate": 3.6363636363636366e-06, "loss": 0.724, "mean_token_accuracy": 0.806972049176693, "num_tokens": 193965.0, "step": 33 }, { "entropy": 1.4742888659238815, "epoch": 0.07760342368045649, "grad_norm": 7.03125, "learning_rate": 3.7500000000000005e-06, "loss": 0.6635, "mean_token_accuracy": 0.8269493877887726, "num_tokens": 199814.0, "step": 34 }, { "entropy": 1.3930696845054626, "epoch": 0.07988587731811697, "grad_norm": 6.5625, "learning_rate": 3.863636363636364e-06, "loss": 0.6553, "mean_token_accuracy": 0.8298437520861626, "num_tokens": 205725.0, "step": 35 }, { "entropy": 1.4377078860998154, "epoch": 0.08216833095577745, "grad_norm": 6.875, "learning_rate": 3.9772727272727275e-06, "loss": 0.6647, "mean_token_accuracy": 0.8262319192290306, "num_tokens": 211044.0, "step": 36 }, { "entropy": 1.4484449177980423, "epoch": 0.08445078459343795, "grad_norm": 5.8125, "learning_rate": 4.0909090909090915e-06, "loss": 0.6505, "mean_token_accuracy": 0.8214789107441902, "num_tokens": 217143.0, "step": 37 }, { "entropy": 1.3406399488449097, "epoch": 0.08673323823109844, "grad_norm": 5.5, "learning_rate": 4.204545454545455e-06, "loss": 0.5331, "mean_token_accuracy": 0.8669695928692818, "num_tokens": 224084.0, "step": 38 }, { "entropy": 1.465222254395485, "epoch": 0.08901569186875892, "grad_norm": 6.09375, "learning_rate": 4.3181818181818185e-06, "loss": 0.5913, "mean_token_accuracy": 0.8346145749092102, "num_tokens": 229446.0, "step": 39 }, { "entropy": 1.4082716703414917, "epoch": 0.0912981455064194, "grad_norm": 5.4375, "learning_rate": 4.4318181818181824e-06, "loss": 0.4967, "mean_token_accuracy": 0.8573063313961029, "num_tokens": 235250.0, "step": 40 }, { "entropy": 1.4065438956022263, "epoch": 0.09358059914407989, "grad_norm": 4.53125, "learning_rate": 4.5454545454545455e-06, "loss": 0.5228, "mean_token_accuracy": 0.8517210483551025, "num_tokens": 241666.0, "step": 41 }, { "entropy": 1.4178601205348969, "epoch": 0.09586305278174037, "grad_norm": 4.875, "learning_rate": 4.6590909090909095e-06, "loss": 0.5534, "mean_token_accuracy": 0.8581771478056908, "num_tokens": 247901.0, "step": 42 }, { "entropy": 1.4665435552597046, "epoch": 0.09814550641940085, "grad_norm": 4.84375, "learning_rate": 4.772727272727273e-06, "loss": 0.524, "mean_token_accuracy": 0.8218341246247292, "num_tokens": 253273.0, "step": 43 }, { "entropy": 1.4858266711235046, "epoch": 0.10042796005706134, "grad_norm": 5.1875, "learning_rate": 4.8863636363636365e-06, "loss": 0.6752, "mean_token_accuracy": 0.826298251748085, "num_tokens": 258616.0, "step": 44 }, { "entropy": 1.3626787662506104, "epoch": 0.10271041369472182, "grad_norm": 4.5, "learning_rate": 5e-06, "loss": 0.5618, "mean_token_accuracy": 0.8469245880842209, "num_tokens": 264408.0, "step": 45 }, { "entropy": 1.3205972537398338, "epoch": 0.1049928673323823, "grad_norm": 3.796875, "learning_rate": 4.99998226312344e-06, "loss": 0.4616, "mean_token_accuracy": 0.8739962726831436, "num_tokens": 270566.0, "step": 46 }, { "entropy": 1.3779225647449493, "epoch": 0.10727532097004279, "grad_norm": 3.484375, "learning_rate": 4.999929052745434e-06, "loss": 0.4547, "mean_token_accuracy": 0.8725937232375145, "num_tokens": 276849.0, "step": 47 }, { "entropy": 1.5054886192083359, "epoch": 0.10955777460770329, "grad_norm": 4.71875, "learning_rate": 4.999840369621011e-06, "loss": 0.5994, "mean_token_accuracy": 0.8370054960250854, "num_tokens": 283205.0, "step": 48 }, { "entropy": 1.5157189071178436, "epoch": 0.11184022824536377, "grad_norm": 4.65625, "learning_rate": 4.999716215008542e-06, "loss": 0.5843, "mean_token_accuracy": 0.8259787857532501, "num_tokens": 288059.0, "step": 49 }, { "entropy": 1.38004170358181, "epoch": 0.11412268188302425, "grad_norm": 3.8125, "learning_rate": 4.999556590669718e-06, "loss": 0.405, "mean_token_accuracy": 0.8887585029006004, "num_tokens": 293798.0, "step": 50 }, { "entropy": 1.6085818111896515, "epoch": 0.11640513552068474, "grad_norm": 6.21875, "learning_rate": 4.99936149886953e-06, "loss": 0.5947, "mean_token_accuracy": 0.8224818632006645, "num_tokens": 298157.0, "step": 51 }, { "entropy": 1.4853103458881378, "epoch": 0.11868758915834522, "grad_norm": 3.453125, "learning_rate": 4.999130942376232e-06, "loss": 0.4428, "mean_token_accuracy": 0.8794936537742615, "num_tokens": 304309.0, "step": 52 }, { "entropy": 1.6272333711385727, "epoch": 0.1209700427960057, "grad_norm": 4.6875, "learning_rate": 4.998864924461305e-06, "loss": 0.5762, "mean_token_accuracy": 0.8293572887778282, "num_tokens": 309756.0, "step": 53 }, { "entropy": 1.289240226149559, "epoch": 0.12325249643366619, "grad_norm": 3.265625, "learning_rate": 4.998563448899413e-06, "loss": 0.4, "mean_token_accuracy": 0.8821459114551544, "num_tokens": 316395.0, "step": 54 }, { "entropy": 1.555517390370369, "epoch": 0.12553495007132667, "grad_norm": 4.5, "learning_rate": 4.998226519968341e-06, "loss": 0.5261, "mean_token_accuracy": 0.8417777121067047, "num_tokens": 321365.0, "step": 55 }, { "entropy": 1.4511889964342117, "epoch": 0.12781740370898717, "grad_norm": 3.828125, "learning_rate": 4.997854142448944e-06, "loss": 0.5362, "mean_token_accuracy": 0.8543838635087013, "num_tokens": 327850.0, "step": 56 }, { "entropy": 1.480227530002594, "epoch": 0.13009985734664764, "grad_norm": 4.90625, "learning_rate": 4.9974463216250735e-06, "loss": 0.6281, "mean_token_accuracy": 0.8336407989263535, "num_tokens": 332724.0, "step": 57 }, { "entropy": 1.4882567524909973, "epoch": 0.13238231098430814, "grad_norm": 4.03125, "learning_rate": 4.997003063283503e-06, "loss": 0.5103, "mean_token_accuracy": 0.854725182056427, "num_tokens": 338496.0, "step": 58 }, { "entropy": 1.3099189698696136, "epoch": 0.1346647646219686, "grad_norm": 3.546875, "learning_rate": 4.996524373713848e-06, "loss": 0.4035, "mean_token_accuracy": 0.8902565762400627, "num_tokens": 344181.0, "step": 59 }, { "entropy": 1.554222896695137, "epoch": 0.1369472182596291, "grad_norm": 4.28125, "learning_rate": 4.996010259708475e-06, "loss": 0.5154, "mean_token_accuracy": 0.8221362680196762, "num_tokens": 349987.0, "step": 60 }, { "entropy": 1.3615255653858185, "epoch": 0.13922967189728958, "grad_norm": 4.125, "learning_rate": 4.995460728562403e-06, "loss": 0.5219, "mean_token_accuracy": 0.8591368719935417, "num_tokens": 355808.0, "step": 61 }, { "entropy": 1.5018275529146194, "epoch": 0.14151212553495007, "grad_norm": 3.8125, "learning_rate": 4.994875788073207e-06, "loss": 0.4981, "mean_token_accuracy": 0.8580456078052521, "num_tokens": 361358.0, "step": 62 }, { "entropy": 1.3897339552640915, "epoch": 0.14379457917261054, "grad_norm": 3.984375, "learning_rate": 4.9942554465409e-06, "loss": 0.4961, "mean_token_accuracy": 0.8571888878941536, "num_tokens": 366798.0, "step": 63 }, { "entropy": 1.3545932322740555, "epoch": 0.14607703281027104, "grad_norm": 3.34375, "learning_rate": 4.99359971276782e-06, "loss": 0.4023, "mean_token_accuracy": 0.8760626539587975, "num_tokens": 373039.0, "step": 64 }, { "entropy": 1.4018055945634842, "epoch": 0.14835948644793154, "grad_norm": 3.484375, "learning_rate": 4.992908596058501e-06, "loss": 0.4874, "mean_token_accuracy": 0.8551009446382523, "num_tokens": 379151.0, "step": 65 }, { "entropy": 1.408715844154358, "epoch": 0.150641940085592, "grad_norm": 3.78125, "learning_rate": 4.9921821062195445e-06, "loss": 0.5979, "mean_token_accuracy": 0.8376783430576324, "num_tokens": 385466.0, "step": 66 }, { "entropy": 1.478136882185936, "epoch": 0.1529243937232525, "grad_norm": 3.34375, "learning_rate": 4.9914202535594795e-06, "loss": 0.4359, "mean_token_accuracy": 0.8765653073787689, "num_tokens": 391861.0, "step": 67 }, { "entropy": 1.3361108154058456, "epoch": 0.15520684736091298, "grad_norm": 3.453125, "learning_rate": 4.990623048888615e-06, "loss": 0.4471, "mean_token_accuracy": 0.8761897683143616, "num_tokens": 397602.0, "step": 68 }, { "entropy": 1.5057465434074402, "epoch": 0.15748930099857347, "grad_norm": 3.984375, "learning_rate": 4.989790503518888e-06, "loss": 0.5262, "mean_token_accuracy": 0.8583421856164932, "num_tokens": 403847.0, "step": 69 }, { "entropy": 1.5415615290403366, "epoch": 0.15977175463623394, "grad_norm": 4.03125, "learning_rate": 4.988922629263701e-06, "loss": 0.598, "mean_token_accuracy": 0.8401808813214302, "num_tokens": 409563.0, "step": 70 }, { "entropy": 1.433893471956253, "epoch": 0.16205420827389444, "grad_norm": 3.875, "learning_rate": 4.988019438437759e-06, "loss": 0.5086, "mean_token_accuracy": 0.8572655767202377, "num_tokens": 415590.0, "step": 71 }, { "entropy": 1.5654226392507553, "epoch": 0.1643366619115549, "grad_norm": 4.65625, "learning_rate": 4.987080943856887e-06, "loss": 0.6098, "mean_token_accuracy": 0.8376531600952148, "num_tokens": 421266.0, "step": 72 }, { "entropy": 1.513851910829544, "epoch": 0.1666191155492154, "grad_norm": 3.78125, "learning_rate": 4.9861071588378565e-06, "loss": 0.4454, "mean_token_accuracy": 0.8665637820959091, "num_tokens": 426394.0, "step": 73 }, { "entropy": 1.5542047619819641, "epoch": 0.1689015691868759, "grad_norm": 4.71875, "learning_rate": 4.9850980971981914e-06, "loss": 0.6814, "mean_token_accuracy": 0.808769017457962, "num_tokens": 431932.0, "step": 74 }, { "entropy": 1.4060749858617783, "epoch": 0.17118402282453637, "grad_norm": 3.53125, "learning_rate": 4.984053773255971e-06, "loss": 0.4207, "mean_token_accuracy": 0.8581205531954765, "num_tokens": 437984.0, "step": 75 }, { "entropy": 1.4776476472616196, "epoch": 0.17346647646219687, "grad_norm": 3.9375, "learning_rate": 4.9829742018296335e-06, "loss": 0.5346, "mean_token_accuracy": 0.8503594622015953, "num_tokens": 444584.0, "step": 76 }, { "entropy": 1.3919195085763931, "epoch": 0.17574893009985734, "grad_norm": 3.21875, "learning_rate": 4.981859398237758e-06, "loss": 0.4565, "mean_token_accuracy": 0.8721787855029106, "num_tokens": 450943.0, "step": 77 }, { "entropy": 1.4689613282680511, "epoch": 0.17803138373751784, "grad_norm": 3.9375, "learning_rate": 4.980709378298851e-06, "loss": 0.5434, "mean_token_accuracy": 0.8531812652945518, "num_tokens": 456471.0, "step": 78 }, { "entropy": 1.474008470773697, "epoch": 0.1803138373751783, "grad_norm": 4.09375, "learning_rate": 4.979524158331123e-06, "loss": 0.531, "mean_token_accuracy": 0.8535453379154205, "num_tokens": 462328.0, "step": 79 }, { "entropy": 1.3587582856416702, "epoch": 0.1825962910128388, "grad_norm": 4.03125, "learning_rate": 4.978303755152254e-06, "loss": 0.4992, "mean_token_accuracy": 0.8549595400691032, "num_tokens": 468402.0, "step": 80 }, { "entropy": 1.3619231432676315, "epoch": 0.18487874465049928, "grad_norm": 3.359375, "learning_rate": 4.977048186079155e-06, "loss": 0.4981, "mean_token_accuracy": 0.8575711026787758, "num_tokens": 473714.0, "step": 81 }, { "entropy": 1.4384445995092392, "epoch": 0.18716119828815977, "grad_norm": 3.328125, "learning_rate": 4.975757468927727e-06, "loss": 0.4181, "mean_token_accuracy": 0.8731885701417923, "num_tokens": 479842.0, "step": 82 }, { "entropy": 1.5311954617500305, "epoch": 0.18944365192582024, "grad_norm": 4.34375, "learning_rate": 4.974431622012601e-06, "loss": 0.6287, "mean_token_accuracy": 0.821938157081604, "num_tokens": 485680.0, "step": 83 }, { "entropy": 1.358711913228035, "epoch": 0.19172610556348074, "grad_norm": 3.65625, "learning_rate": 4.973070664146885e-06, "loss": 0.4416, "mean_token_accuracy": 0.873858779668808, "num_tokens": 491390.0, "step": 84 }, { "entropy": 1.4033315032720566, "epoch": 0.19400855920114124, "grad_norm": 3.890625, "learning_rate": 4.971674614641891e-06, "loss": 0.4835, "mean_token_accuracy": 0.861111544072628, "num_tokens": 497469.0, "step": 85 }, { "entropy": 1.373718798160553, "epoch": 0.1962910128388017, "grad_norm": 3.46875, "learning_rate": 4.970243493306865e-06, "loss": 0.4599, "mean_token_accuracy": 0.8647707998752594, "num_tokens": 503754.0, "step": 86 }, { "entropy": 1.4159798175096512, "epoch": 0.1985734664764622, "grad_norm": 3.71875, "learning_rate": 4.968777320448707e-06, "loss": 0.41, "mean_token_accuracy": 0.8731393367052078, "num_tokens": 509255.0, "step": 87 }, { "entropy": 1.397733435034752, "epoch": 0.20085592011412268, "grad_norm": 4.0, "learning_rate": 4.9672761168716766e-06, "loss": 0.4607, "mean_token_accuracy": 0.8771609216928482, "num_tokens": 515162.0, "step": 88 }, { "entropy": 1.3901693522930145, "epoch": 0.20313837375178317, "grad_norm": 3.703125, "learning_rate": 4.9657399038771045e-06, "loss": 0.4985, "mean_token_accuracy": 0.8564205095171928, "num_tokens": 520980.0, "step": 89 }, { "entropy": 1.470759555697441, "epoch": 0.20542082738944364, "grad_norm": 4.09375, "learning_rate": 4.964168703263086e-06, "loss": 0.5552, "mean_token_accuracy": 0.834749348461628, "num_tokens": 526901.0, "step": 90 }, { "entropy": 1.5493524819612503, "epoch": 0.20770328102710414, "grad_norm": 4.09375, "learning_rate": 4.962562537324176e-06, "loss": 0.5276, "mean_token_accuracy": 0.8242713585495949, "num_tokens": 532502.0, "step": 91 }, { "entropy": 1.4955510944128036, "epoch": 0.2099857346647646, "grad_norm": 4.5, "learning_rate": 4.960921428851066e-06, "loss": 0.6117, "mean_token_accuracy": 0.8246004208922386, "num_tokens": 538159.0, "step": 92 }, { "entropy": 1.4567335098981857, "epoch": 0.2122681883024251, "grad_norm": 3.0, "learning_rate": 4.959245401130269e-06, "loss": 0.3503, "mean_token_accuracy": 0.8856313973665237, "num_tokens": 544079.0, "step": 93 }, { "entropy": 1.458535224199295, "epoch": 0.21455064194008558, "grad_norm": 3.625, "learning_rate": 4.957534477943782e-06, "loss": 0.4434, "mean_token_accuracy": 0.858425110578537, "num_tokens": 550037.0, "step": 94 }, { "entropy": 1.3983053117990494, "epoch": 0.21683309557774608, "grad_norm": 3.375, "learning_rate": 4.955788683568749e-06, "loss": 0.4004, "mean_token_accuracy": 0.8748428821563721, "num_tokens": 556585.0, "step": 95 }, { "entropy": 1.481145054101944, "epoch": 0.21911554921540657, "grad_norm": 3.3125, "learning_rate": 4.954008042777125e-06, "loss": 0.409, "mean_token_accuracy": 0.8758149892091751, "num_tokens": 562355.0, "step": 96 }, { "entropy": 1.6243803054094315, "epoch": 0.22139800285306704, "grad_norm": 4.75, "learning_rate": 4.952192580835313e-06, "loss": 0.6636, "mean_token_accuracy": 0.7973536550998688, "num_tokens": 568202.0, "step": 97 }, { "entropy": 1.575976401567459, "epoch": 0.22368045649072754, "grad_norm": 4.59375, "learning_rate": 4.950342323503812e-06, "loss": 0.6046, "mean_token_accuracy": 0.813086025416851, "num_tokens": 573655.0, "step": 98 }, { "entropy": 1.5205018073320389, "epoch": 0.225962910128388, "grad_norm": 3.953125, "learning_rate": 4.9484572970368516e-06, "loss": 0.5502, "mean_token_accuracy": 0.8478811085224152, "num_tokens": 579742.0, "step": 99 }, { "entropy": 1.5319028943777084, "epoch": 0.2282453637660485, "grad_norm": 4.71875, "learning_rate": 4.946537528182017e-06, "loss": 0.6014, "mean_token_accuracy": 0.8344146087765694, "num_tokens": 584824.0, "step": 100 }, { "epoch": 0.2282453637660485, "eval_entropy": 1.4501528475019667, "eval_loss": 0.5052191615104675, "eval_mean_token_accuracy": 0.8605326036612193, "eval_num_tokens": 584824.0, "eval_runtime": 4.4666, "eval_samples_per_second": 20.149, "eval_steps_per_second": 20.149, "step": 100 }, { "entropy": 1.3917143046855927, "epoch": 0.23052781740370898, "grad_norm": 3.203125, "learning_rate": 4.944583044179871e-06, "loss": 0.3933, "mean_token_accuracy": 0.8733155429363251, "num_tokens": 590608.0, "step": 101 }, { "entropy": 1.3328562825918198, "epoch": 0.23281027104136948, "grad_norm": 3.0625, "learning_rate": 4.942593872763566e-06, "loss": 0.3922, "mean_token_accuracy": 0.8770610764622688, "num_tokens": 596918.0, "step": 102 }, { "entropy": 1.3897913247346878, "epoch": 0.23509272467902995, "grad_norm": 3.4375, "learning_rate": 4.940570042158454e-06, "loss": 0.4864, "mean_token_accuracy": 0.8629380613565445, "num_tokens": 602674.0, "step": 103 }, { "entropy": 1.5906241983175278, "epoch": 0.23737517831669044, "grad_norm": 4.46875, "learning_rate": 4.93851158108168e-06, "loss": 0.6066, "mean_token_accuracy": 0.8188068121671677, "num_tokens": 608041.0, "step": 104 }, { "entropy": 1.421783059835434, "epoch": 0.2396576319543509, "grad_norm": 3.453125, "learning_rate": 4.93641851874178e-06, "loss": 0.4813, "mean_token_accuracy": 0.8542051687836647, "num_tokens": 613908.0, "step": 105 }, { "entropy": 1.4839935898780823, "epoch": 0.2419400855920114, "grad_norm": 4.34375, "learning_rate": 4.934290884838266e-06, "loss": 0.539, "mean_token_accuracy": 0.8587613850831985, "num_tokens": 620475.0, "step": 106 }, { "entropy": 1.4981091767549515, "epoch": 0.2442225392296719, "grad_norm": 3.40625, "learning_rate": 4.932128709561202e-06, "loss": 0.4702, "mean_token_accuracy": 0.866189256310463, "num_tokens": 626833.0, "step": 107 }, { "entropy": 1.47100168466568, "epoch": 0.24650499286733238, "grad_norm": 3.453125, "learning_rate": 4.929932023590776e-06, "loss": 0.4146, "mean_token_accuracy": 0.8706357181072235, "num_tokens": 632605.0, "step": 108 }, { "entropy": 1.4089600145816803, "epoch": 0.24878744650499288, "grad_norm": 2.921875, "learning_rate": 4.9277008580968665e-06, "loss": 0.4052, "mean_token_accuracy": 0.8793638423085213, "num_tokens": 639026.0, "step": 109 }, { "entropy": 1.4623335748910904, "epoch": 0.25106990014265335, "grad_norm": 3.109375, "learning_rate": 4.925435244738599e-06, "loss": 0.4251, "mean_token_accuracy": 0.8607661128044128, "num_tokens": 645661.0, "step": 110 }, { "entropy": 1.469603717327118, "epoch": 0.25335235378031384, "grad_norm": 3.203125, "learning_rate": 4.923135215663897e-06, "loss": 0.4562, "mean_token_accuracy": 0.8637586832046509, "num_tokens": 652088.0, "step": 111 }, { "entropy": 1.4699177891016006, "epoch": 0.25563480741797434, "grad_norm": 3.78125, "learning_rate": 4.920800803509026e-06, "loss": 0.4358, "mean_token_accuracy": 0.8661052659153938, "num_tokens": 657148.0, "step": 112 }, { "entropy": 1.4687887877225876, "epoch": 0.2579172610556348, "grad_norm": 4.15625, "learning_rate": 4.91843204139813e-06, "loss": 0.4832, "mean_token_accuracy": 0.87067711353302, "num_tokens": 662846.0, "step": 113 }, { "entropy": 1.3910206109285355, "epoch": 0.2601997146932953, "grad_norm": 3.96875, "learning_rate": 4.916028962942763e-06, "loss": 0.4606, "mean_token_accuracy": 0.8688057661056519, "num_tokens": 668283.0, "step": 114 }, { "entropy": 1.4946473091840744, "epoch": 0.2624821683309558, "grad_norm": 4.4375, "learning_rate": 4.913591602241409e-06, "loss": 0.5177, "mean_token_accuracy": 0.8503523468971252, "num_tokens": 673962.0, "step": 115 }, { "entropy": 1.4268899112939835, "epoch": 0.2647646219686163, "grad_norm": 3.734375, "learning_rate": 4.911119993878999e-06, "loss": 0.4608, "mean_token_accuracy": 0.8624838441610336, "num_tokens": 679433.0, "step": 116 }, { "entropy": 1.4775933474302292, "epoch": 0.2670470756062768, "grad_norm": 3.359375, "learning_rate": 4.908614172926426e-06, "loss": 0.373, "mean_token_accuracy": 0.8674890100955963, "num_tokens": 685178.0, "step": 117 }, { "entropy": 1.4562716633081436, "epoch": 0.2693295292439372, "grad_norm": 3.890625, "learning_rate": 4.906074174940038e-06, "loss": 0.5465, "mean_token_accuracy": 0.8421404510736465, "num_tokens": 691044.0, "step": 118 }, { "entropy": 1.404031679034233, "epoch": 0.2716119828815977, "grad_norm": 3.625, "learning_rate": 4.903500035961139e-06, "loss": 0.4888, "mean_token_accuracy": 0.8540224209427834, "num_tokens": 697301.0, "step": 119 }, { "entropy": 1.421856850385666, "epoch": 0.2738944365192582, "grad_norm": 3.328125, "learning_rate": 4.9008917925154795e-06, "loss": 0.438, "mean_token_accuracy": 0.8775565698742867, "num_tokens": 704275.0, "step": 120 }, { "entropy": 1.5078845471143723, "epoch": 0.2761768901569187, "grad_norm": 3.640625, "learning_rate": 4.89824948161273e-06, "loss": 0.4837, "mean_token_accuracy": 0.8578910827636719, "num_tokens": 710429.0, "step": 121 }, { "entropy": 1.4396383464336395, "epoch": 0.27845934379457915, "grad_norm": 3.71875, "learning_rate": 4.895573140745967e-06, "loss": 0.5219, "mean_token_accuracy": 0.8433092087507248, "num_tokens": 715838.0, "step": 122 }, { "entropy": 1.4553385972976685, "epoch": 0.28074179743223965, "grad_norm": 3.578125, "learning_rate": 4.892862807891131e-06, "loss": 0.4401, "mean_token_accuracy": 0.869629830121994, "num_tokens": 721249.0, "step": 123 }, { "entropy": 1.4222912788391113, "epoch": 0.28302425106990015, "grad_norm": 3.921875, "learning_rate": 4.890118521506494e-06, "loss": 0.5689, "mean_token_accuracy": 0.8471446335315704, "num_tokens": 727806.0, "step": 124 }, { "entropy": 1.4638441801071167, "epoch": 0.28530670470756064, "grad_norm": 3.59375, "learning_rate": 4.8873403205321115e-06, "loss": 0.4898, "mean_token_accuracy": 0.8609614819288254, "num_tokens": 733588.0, "step": 125 }, { "entropy": 1.360969141125679, "epoch": 0.2875891583452211, "grad_norm": 4.9375, "learning_rate": 4.884528244389269e-06, "loss": 0.5004, "mean_token_accuracy": 0.8577578216791153, "num_tokens": 739069.0, "step": 126 }, { "entropy": 1.4701900631189346, "epoch": 0.2898716119828816, "grad_norm": 3.890625, "learning_rate": 4.881682332979925e-06, "loss": 0.4782, "mean_token_accuracy": 0.8597236052155495, "num_tokens": 744612.0, "step": 127 }, { "entropy": 1.484321504831314, "epoch": 0.2921540656205421, "grad_norm": 4.34375, "learning_rate": 4.878802626686141e-06, "loss": 0.5044, "mean_token_accuracy": 0.8599332422018051, "num_tokens": 750198.0, "step": 128 }, { "entropy": 1.4526187181472778, "epoch": 0.2944365192582026, "grad_norm": 4.25, "learning_rate": 4.8758891663695165e-06, "loss": 0.5283, "mean_token_accuracy": 0.8519927933812141, "num_tokens": 755825.0, "step": 129 }, { "entropy": 1.487746685743332, "epoch": 0.2967189728958631, "grad_norm": 3.859375, "learning_rate": 4.872941993370598e-06, "loss": 0.4834, "mean_token_accuracy": 0.865722268819809, "num_tokens": 762609.0, "step": 130 }, { "entropy": 1.4334597885608673, "epoch": 0.2990014265335235, "grad_norm": 3.609375, "learning_rate": 4.869961149508301e-06, "loss": 0.462, "mean_token_accuracy": 0.8797513917088509, "num_tokens": 768825.0, "step": 131 }, { "entropy": 1.5593868792057037, "epoch": 0.301283880171184, "grad_norm": 3.75, "learning_rate": 4.866946677079314e-06, "loss": 0.4398, "mean_token_accuracy": 0.8622937723994255, "num_tokens": 774231.0, "step": 132 }, { "entropy": 1.582775130867958, "epoch": 0.3035663338088445, "grad_norm": 4.1875, "learning_rate": 4.8638986188574955e-06, "loss": 0.5733, "mean_token_accuracy": 0.8216232135891914, "num_tokens": 779217.0, "step": 133 }, { "entropy": 1.4957093298435211, "epoch": 0.305848787446505, "grad_norm": 3.875, "learning_rate": 4.8608170180932725e-06, "loss": 0.4983, "mean_token_accuracy": 0.8560524433851242, "num_tokens": 785209.0, "step": 134 }, { "entropy": 1.4334331154823303, "epoch": 0.30813124108416545, "grad_norm": 3.375, "learning_rate": 4.857701918513023e-06, "loss": 0.4457, "mean_token_accuracy": 0.8704549074172974, "num_tokens": 791251.0, "step": 135 }, { "entropy": 1.4960424304008484, "epoch": 0.31041369472182595, "grad_norm": 3.546875, "learning_rate": 4.854553364318456e-06, "loss": 0.4823, "mean_token_accuracy": 0.869213730096817, "num_tokens": 797202.0, "step": 136 }, { "entropy": 1.3933140188455582, "epoch": 0.31269614835948645, "grad_norm": 3.0, "learning_rate": 4.851371400185986e-06, "loss": 0.4387, "mean_token_accuracy": 0.8605329319834709, "num_tokens": 804144.0, "step": 137 }, { "entropy": 1.4915095120668411, "epoch": 0.31497860199714695, "grad_norm": 4.03125, "learning_rate": 4.848156071266095e-06, "loss": 0.404, "mean_token_accuracy": 0.8624937981367111, "num_tokens": 809125.0, "step": 138 }, { "entropy": 1.422121912240982, "epoch": 0.31726105563480744, "grad_norm": 3.5, "learning_rate": 4.844907423182699e-06, "loss": 0.3698, "mean_token_accuracy": 0.8753552809357643, "num_tokens": 814420.0, "step": 139 }, { "entropy": 1.3587403669953346, "epoch": 0.3195435092724679, "grad_norm": 3.65625, "learning_rate": 4.841625502032495e-06, "loss": 0.4201, "mean_token_accuracy": 0.8749541118741035, "num_tokens": 819445.0, "step": 140 }, { "entropy": 1.5249932259321213, "epoch": 0.3218259629101284, "grad_norm": 3.765625, "learning_rate": 4.838310354384304e-06, "loss": 0.4569, "mean_token_accuracy": 0.8636204749345779, "num_tokens": 825423.0, "step": 141 }, { "entropy": 1.4787572473287582, "epoch": 0.3241084165477889, "grad_norm": 4.28125, "learning_rate": 4.834962027278418e-06, "loss": 0.4271, "mean_token_accuracy": 0.8966826573014259, "num_tokens": 830608.0, "step": 142 }, { "entropy": 1.3900313079357147, "epoch": 0.3263908701854494, "grad_norm": 3.3125, "learning_rate": 4.831580568225931e-06, "loss": 0.4272, "mean_token_accuracy": 0.8754951432347298, "num_tokens": 837069.0, "step": 143 }, { "entropy": 1.4659005105495453, "epoch": 0.3286733238231098, "grad_norm": 3.71875, "learning_rate": 4.828166025208059e-06, "loss": 0.4788, "mean_token_accuracy": 0.8542606756091118, "num_tokens": 842779.0, "step": 144 }, { "entropy": 1.4241313189268112, "epoch": 0.3309557774607703, "grad_norm": 3.5, "learning_rate": 4.824718446675465e-06, "loss": 0.4501, "mean_token_accuracy": 0.8673816919326782, "num_tokens": 848075.0, "step": 145 }, { "entropy": 1.3615167737007141, "epoch": 0.3332382310984308, "grad_norm": 3.984375, "learning_rate": 4.821237881547567e-06, "loss": 0.4803, "mean_token_accuracy": 0.8680660426616669, "num_tokens": 853972.0, "step": 146 }, { "entropy": 1.4747860878705978, "epoch": 0.3355206847360913, "grad_norm": 3.890625, "learning_rate": 4.8177243792118515e-06, "loss": 0.4336, "mean_token_accuracy": 0.8747361823916435, "num_tokens": 859859.0, "step": 147 }, { "entropy": 1.5414969474077225, "epoch": 0.3378031383737518, "grad_norm": 3.484375, "learning_rate": 4.814177989523162e-06, "loss": 0.4489, "mean_token_accuracy": 0.8644633367657661, "num_tokens": 865836.0, "step": 148 }, { "entropy": 1.6249495893716812, "epoch": 0.34008559201141225, "grad_norm": 3.53125, "learning_rate": 4.810598762803e-06, "loss": 0.5226, "mean_token_accuracy": 0.8477596640586853, "num_tokens": 872086.0, "step": 149 }, { "entropy": 1.4743667244911194, "epoch": 0.34236804564907275, "grad_norm": 3.75, "learning_rate": 4.8069867498388066e-06, "loss": 0.4693, "mean_token_accuracy": 0.8513918668031693, "num_tokens": 877138.0, "step": 150 }, { "entropy": 1.3822671622037888, "epoch": 0.34465049928673325, "grad_norm": 3.21875, "learning_rate": 4.803342001883247e-06, "loss": 0.408, "mean_token_accuracy": 0.8763712868094444, "num_tokens": 883268.0, "step": 151 }, { "entropy": 1.4955266863107681, "epoch": 0.34693295292439374, "grad_norm": 4.15625, "learning_rate": 4.799664570653473e-06, "loss": 0.5271, "mean_token_accuracy": 0.8504318669438362, "num_tokens": 889206.0, "step": 152 }, { "entropy": 1.6125495880842209, "epoch": 0.3492154065620542, "grad_norm": 5.71875, "learning_rate": 4.795954508330403e-06, "loss": 0.6248, "mean_token_accuracy": 0.8179907724261284, "num_tokens": 894476.0, "step": 153 }, { "entropy": 1.5931424498558044, "epoch": 0.3514978601997147, "grad_norm": 4.75, "learning_rate": 4.792211867557969e-06, "loss": 0.4888, "mean_token_accuracy": 0.8579384312033653, "num_tokens": 899026.0, "step": 154 }, { "entropy": 1.4209279268980026, "epoch": 0.3537803138373752, "grad_norm": 3.484375, "learning_rate": 4.788436701442378e-06, "loss": 0.4354, "mean_token_accuracy": 0.8708065152168274, "num_tokens": 905347.0, "step": 155 }, { "entropy": 1.4381519109010696, "epoch": 0.3560627674750357, "grad_norm": 3.703125, "learning_rate": 4.784629063551354e-06, "loss": 0.5609, "mean_token_accuracy": 0.8458188697695732, "num_tokens": 911400.0, "step": 156 }, { "entropy": 1.4265454858541489, "epoch": 0.3583452211126961, "grad_norm": 3.5, "learning_rate": 4.780789007913379e-06, "loss": 0.516, "mean_token_accuracy": 0.8464484214782715, "num_tokens": 917633.0, "step": 157 }, { "entropy": 1.6952187418937683, "epoch": 0.3606276747503566, "grad_norm": 4.46875, "learning_rate": 4.776916589016928e-06, "loss": 0.6655, "mean_token_accuracy": 0.8154120817780495, "num_tokens": 922878.0, "step": 158 }, { "entropy": 1.4849806427955627, "epoch": 0.3629101283880171, "grad_norm": 3.984375, "learning_rate": 4.773011861809694e-06, "loss": 0.5529, "mean_token_accuracy": 0.8317237794399261, "num_tokens": 928432.0, "step": 159 }, { "entropy": 1.3825362920761108, "epoch": 0.3651925820256776, "grad_norm": 3.1875, "learning_rate": 4.769074881697806e-06, "loss": 0.422, "mean_token_accuracy": 0.8742568120360374, "num_tokens": 934019.0, "step": 160 }, { "entropy": 1.429061233997345, "epoch": 0.3674750356633381, "grad_norm": 3.328125, "learning_rate": 4.765105704545052e-06, "loss": 0.4181, "mean_token_accuracy": 0.8700381815433502, "num_tokens": 940405.0, "step": 161 }, { "entropy": 1.4522172808647156, "epoch": 0.36975748930099855, "grad_norm": 3.296875, "learning_rate": 4.761104386672074e-06, "loss": 0.4664, "mean_token_accuracy": 0.8705998063087463, "num_tokens": 946891.0, "step": 162 }, { "entropy": 1.4823334366083145, "epoch": 0.37203994293865905, "grad_norm": 3.171875, "learning_rate": 4.757070984855577e-06, "loss": 0.3902, "mean_token_accuracy": 0.8853188008069992, "num_tokens": 954063.0, "step": 163 }, { "entropy": 1.4951584190130234, "epoch": 0.37432239657631955, "grad_norm": 4.25, "learning_rate": 4.7530055563275225e-06, "loss": 0.4601, "mean_token_accuracy": 0.8618411421775818, "num_tokens": 959914.0, "step": 164 }, { "entropy": 1.4573408663272858, "epoch": 0.37660485021398005, "grad_norm": 4.625, "learning_rate": 4.748908158774312e-06, "loss": 0.5381, "mean_token_accuracy": 0.8516411259770393, "num_tokens": 965145.0, "step": 165 }, { "entropy": 1.4346065074205399, "epoch": 0.3788873038516405, "grad_norm": 3.421875, "learning_rate": 4.744778850335974e-06, "loss": 0.4718, "mean_token_accuracy": 0.8635387867689133, "num_tokens": 971469.0, "step": 166 }, { "entropy": 1.4204413443803787, "epoch": 0.381169757489301, "grad_norm": 3.6875, "learning_rate": 4.7406176896053356e-06, "loss": 0.4281, "mean_token_accuracy": 0.8760756626725197, "num_tokens": 976905.0, "step": 167 }, { "entropy": 1.4582399874925613, "epoch": 0.3834522111269615, "grad_norm": 3.625, "learning_rate": 4.736424735627193e-06, "loss": 0.472, "mean_token_accuracy": 0.8653873577713966, "num_tokens": 982797.0, "step": 168 }, { "entropy": 1.4145529568195343, "epoch": 0.385734664764622, "grad_norm": 4.15625, "learning_rate": 4.73220004789747e-06, "loss": 0.4677, "mean_token_accuracy": 0.8689080029726028, "num_tokens": 988588.0, "step": 169 }, { "entropy": 1.4675364196300507, "epoch": 0.3880171184022825, "grad_norm": 3.796875, "learning_rate": 4.7279436863623805e-06, "loss": 0.4218, "mean_token_accuracy": 0.8724250420928001, "num_tokens": 994490.0, "step": 170 }, { "entropy": 1.4822284132242203, "epoch": 0.3902995720399429, "grad_norm": 3.25, "learning_rate": 4.7236557114175705e-06, "loss": 0.4036, "mean_token_accuracy": 0.8729385659098625, "num_tokens": 1000341.0, "step": 171 }, { "entropy": 1.5275023579597473, "epoch": 0.3925820256776034, "grad_norm": 3.71875, "learning_rate": 4.719336183907266e-06, "loss": 0.5107, "mean_token_accuracy": 0.846622422337532, "num_tokens": 1005552.0, "step": 172 }, { "entropy": 1.4371494799852371, "epoch": 0.3948644793152639, "grad_norm": 3.859375, "learning_rate": 4.7149851651234085e-06, "loss": 0.4761, "mean_token_accuracy": 0.856620728969574, "num_tokens": 1011272.0, "step": 173 }, { "entropy": 1.4481075257062912, "epoch": 0.3971469329529244, "grad_norm": 3.265625, "learning_rate": 4.710602716804784e-06, "loss": 0.4907, "mean_token_accuracy": 0.8551308736205101, "num_tokens": 1018025.0, "step": 174 }, { "entropy": 1.4776830077171326, "epoch": 0.39942938659058486, "grad_norm": 3.484375, "learning_rate": 4.706188901136148e-06, "loss": 0.4157, "mean_token_accuracy": 0.8659848943352699, "num_tokens": 1023559.0, "step": 175 }, { "entropy": 1.3460393995046616, "epoch": 0.40171184022824535, "grad_norm": 3.15625, "learning_rate": 4.701743780747345e-06, "loss": 0.3891, "mean_token_accuracy": 0.8979940786957741, "num_tokens": 1029587.0, "step": 176 }, { "entropy": 1.5323508977890015, "epoch": 0.40399429386590585, "grad_norm": 3.671875, "learning_rate": 4.697267418712415e-06, "loss": 0.5064, "mean_token_accuracy": 0.8600496724247932, "num_tokens": 1035523.0, "step": 177 }, { "entropy": 1.3961755633354187, "epoch": 0.40627674750356635, "grad_norm": 3.4375, "learning_rate": 4.6927598785487026e-06, "loss": 0.4937, "mean_token_accuracy": 0.8478540182113647, "num_tokens": 1041403.0, "step": 178 }, { "entropy": 1.4182656705379486, "epoch": 0.40855920114122685, "grad_norm": 3.015625, "learning_rate": 4.6882212242159555e-06, "loss": 0.3456, "mean_token_accuracy": 0.8982625529170036, "num_tokens": 1047682.0, "step": 179 }, { "entropy": 1.548415094614029, "epoch": 0.4108416547788873, "grad_norm": 4.3125, "learning_rate": 4.683651520115414e-06, "loss": 0.5678, "mean_token_accuracy": 0.8428888395428658, "num_tokens": 1053172.0, "step": 180 }, { "entropy": 1.396517127752304, "epoch": 0.4131241084165478, "grad_norm": 3.46875, "learning_rate": 4.679050831088902e-06, "loss": 0.4803, "mean_token_accuracy": 0.856790341436863, "num_tokens": 1059373.0, "step": 181 }, { "entropy": 1.3589655607938766, "epoch": 0.4154065620542083, "grad_norm": 3.53125, "learning_rate": 4.674419222417899e-06, "loss": 0.3944, "mean_token_accuracy": 0.8856743425130844, "num_tokens": 1065347.0, "step": 182 }, { "entropy": 1.5359989404678345, "epoch": 0.4176890156918688, "grad_norm": 4.15625, "learning_rate": 4.669756759822625e-06, "loss": 0.4896, "mean_token_accuracy": 0.8504308834671974, "num_tokens": 1070311.0, "step": 183 }, { "entropy": 1.3297200053930283, "epoch": 0.4199714693295292, "grad_norm": 3.21875, "learning_rate": 4.665063509461098e-06, "loss": 0.3047, "mean_token_accuracy": 0.9152820706367493, "num_tokens": 1076590.0, "step": 184 }, { "entropy": 1.3356045931577682, "epoch": 0.4222539229671897, "grad_norm": 2.9375, "learning_rate": 4.660339537928198e-06, "loss": 0.3891, "mean_token_accuracy": 0.8858283907175064, "num_tokens": 1082550.0, "step": 185 }, { "entropy": 1.518212452530861, "epoch": 0.4245363766048502, "grad_norm": 3.078125, "learning_rate": 4.655584912254727e-06, "loss": 0.393, "mean_token_accuracy": 0.8758783265948296, "num_tokens": 1088391.0, "step": 186 }, { "entropy": 1.488260880112648, "epoch": 0.4268188302425107, "grad_norm": 3.65625, "learning_rate": 4.650799699906452e-06, "loss": 0.4005, "mean_token_accuracy": 0.871321365237236, "num_tokens": 1093823.0, "step": 187 }, { "entropy": 1.4447910338640213, "epoch": 0.42910128388017116, "grad_norm": 3.21875, "learning_rate": 4.645983968783148e-06, "loss": 0.3873, "mean_token_accuracy": 0.8878121376037598, "num_tokens": 1099347.0, "step": 188 }, { "entropy": 1.4393097907304764, "epoch": 0.43138373751783166, "grad_norm": 3.40625, "learning_rate": 4.64113778721764e-06, "loss": 0.3712, "mean_token_accuracy": 0.8943579867482185, "num_tokens": 1104941.0, "step": 189 }, { "entropy": 1.5411454141139984, "epoch": 0.43366619115549215, "grad_norm": 4.15625, "learning_rate": 4.636261223974826e-06, "loss": 0.498, "mean_token_accuracy": 0.8571378961205482, "num_tokens": 1110031.0, "step": 190 }, { "entropy": 1.3604239225387573, "epoch": 0.43594864479315265, "grad_norm": 3.53125, "learning_rate": 4.631354348250706e-06, "loss": 0.4366, "mean_token_accuracy": 0.8668901473283768, "num_tokens": 1116176.0, "step": 191 }, { "entropy": 1.4267419427633286, "epoch": 0.43823109843081315, "grad_norm": 3.125, "learning_rate": 4.626417229671401e-06, "loss": 0.4324, "mean_token_accuracy": 0.8729524612426758, "num_tokens": 1122065.0, "step": 192 }, { "entropy": 1.554912507534027, "epoch": 0.4405135520684736, "grad_norm": 4.34375, "learning_rate": 4.621449938292159e-06, "loss": 0.5843, "mean_token_accuracy": 0.8273278325796127, "num_tokens": 1127506.0, "step": 193 }, { "entropy": 1.3502502888441086, "epoch": 0.4427960057061341, "grad_norm": 2.828125, "learning_rate": 4.616452544596367e-06, "loss": 0.3874, "mean_token_accuracy": 0.8785886839032173, "num_tokens": 1133494.0, "step": 194 }, { "entropy": 1.4718603789806366, "epoch": 0.4450784593437946, "grad_norm": 3.90625, "learning_rate": 4.611425119494552e-06, "loss": 0.4499, "mean_token_accuracy": 0.8621420189738274, "num_tokens": 1139036.0, "step": 195 }, { "entropy": 1.592808559536934, "epoch": 0.4473609129814551, "grad_norm": 4.5625, "learning_rate": 4.606367734323365e-06, "loss": 0.5667, "mean_token_accuracy": 0.832310289144516, "num_tokens": 1144022.0, "step": 196 }, { "entropy": 1.410594865679741, "epoch": 0.4496433666191155, "grad_norm": 4.03125, "learning_rate": 4.601280460844583e-06, "loss": 0.5266, "mean_token_accuracy": 0.855924166738987, "num_tokens": 1150011.0, "step": 197 }, { "entropy": 1.4304940402507782, "epoch": 0.451925820256776, "grad_norm": 4.46875, "learning_rate": 4.596163371244076e-06, "loss": 0.5302, "mean_token_accuracy": 0.8468711525201797, "num_tokens": 1155938.0, "step": 198 }, { "entropy": 1.4850642681121826, "epoch": 0.4542082738944365, "grad_norm": 3.8125, "learning_rate": 4.591016538130796e-06, "loss": 0.5296, "mean_token_accuracy": 0.8607726991176605, "num_tokens": 1161187.0, "step": 199 }, { "entropy": 1.495200276374817, "epoch": 0.456490727532097, "grad_norm": 3.71875, "learning_rate": 4.585840034535736e-06, "loss": 0.4806, "mean_token_accuracy": 0.865336537361145, "num_tokens": 1167354.0, "step": 200 }, { "epoch": 0.456490727532097, "eval_entropy": 1.4645510156949362, "eval_loss": 0.48574092984199524, "eval_mean_token_accuracy": 0.8648963557349311, "eval_num_tokens": 1167354.0, "eval_runtime": 4.6146, "eval_samples_per_second": 19.503, "eval_steps_per_second": 19.503, "step": 200 }, { "entropy": 1.5986905246973038, "epoch": 0.4587731811697575, "grad_norm": 3.984375, "learning_rate": 4.580633933910901e-06, "loss": 0.4827, "mean_token_accuracy": 0.8589570224285126, "num_tokens": 1173168.0, "step": 201 }, { "entropy": 1.4304189831018448, "epoch": 0.46105563480741796, "grad_norm": 3.328125, "learning_rate": 4.575398310128263e-06, "loss": 0.432, "mean_token_accuracy": 0.870637446641922, "num_tokens": 1178884.0, "step": 202 }, { "entropy": 1.5412327647209167, "epoch": 0.46333808844507846, "grad_norm": 4.03125, "learning_rate": 4.570133237478711e-06, "loss": 0.5089, "mean_token_accuracy": 0.8491686582565308, "num_tokens": 1184480.0, "step": 203 }, { "entropy": 1.4805094599723816, "epoch": 0.46562054208273895, "grad_norm": 3.453125, "learning_rate": 4.564838790671e-06, "loss": 0.5336, "mean_token_accuracy": 0.8480750620365143, "num_tokens": 1190484.0, "step": 204 }, { "entropy": 1.4799759984016418, "epoch": 0.46790299572039945, "grad_norm": 3.359375, "learning_rate": 4.55951504483069e-06, "loss": 0.4372, "mean_token_accuracy": 0.8827960044145584, "num_tokens": 1195901.0, "step": 205 }, { "entropy": 1.5237813293933868, "epoch": 0.4701854493580599, "grad_norm": 4.125, "learning_rate": 4.55416207549908e-06, "loss": 0.613, "mean_token_accuracy": 0.8307090178132057, "num_tokens": 1201383.0, "step": 206 }, { "entropy": 1.438712790608406, "epoch": 0.4724679029957204, "grad_norm": 3.21875, "learning_rate": 4.548779958632134e-06, "loss": 0.5351, "mean_token_accuracy": 0.8520702794194221, "num_tokens": 1207874.0, "step": 207 }, { "entropy": 1.4036246687173843, "epoch": 0.4747503566333809, "grad_norm": 3.390625, "learning_rate": 4.543368770599406e-06, "loss": 0.346, "mean_token_accuracy": 0.8787712529301643, "num_tokens": 1213989.0, "step": 208 }, { "entropy": 1.486038789153099, "epoch": 0.4770328102710414, "grad_norm": 3.71875, "learning_rate": 4.537928588182955e-06, "loss": 0.5211, "mean_token_accuracy": 0.8482290953397751, "num_tokens": 1219525.0, "step": 209 }, { "entropy": 1.444077506661415, "epoch": 0.4793152639087018, "grad_norm": 2.953125, "learning_rate": 4.532459488576258e-06, "loss": 0.3976, "mean_token_accuracy": 0.8832324147224426, "num_tokens": 1226231.0, "step": 210 }, { "entropy": 1.5054399818181992, "epoch": 0.4815977175463623, "grad_norm": 3.671875, "learning_rate": 4.526961549383109e-06, "loss": 0.4581, "mean_token_accuracy": 0.8546851649880409, "num_tokens": 1232271.0, "step": 211 }, { "entropy": 1.4887232929468155, "epoch": 0.4838801711840228, "grad_norm": 3.1875, "learning_rate": 4.521434848616523e-06, "loss": 0.4776, "mean_token_accuracy": 0.8665826469659805, "num_tokens": 1239076.0, "step": 212 }, { "entropy": 1.4471513032913208, "epoch": 0.4861626248216833, "grad_norm": 3.140625, "learning_rate": 4.515879464697629e-06, "loss": 0.3437, "mean_token_accuracy": 0.9033405035734177, "num_tokens": 1245117.0, "step": 213 }, { "entropy": 1.4255793392658234, "epoch": 0.4884450784593438, "grad_norm": 3.21875, "learning_rate": 4.5102954764545525e-06, "loss": 0.3922, "mean_token_accuracy": 0.879116877913475, "num_tokens": 1251024.0, "step": 214 }, { "entropy": 1.4146728217601776, "epoch": 0.49072753209700426, "grad_norm": 3.21875, "learning_rate": 4.5046829631213014e-06, "loss": 0.4581, "mean_token_accuracy": 0.8701305538415909, "num_tokens": 1257738.0, "step": 215 }, { "entropy": 1.4356386065483093, "epoch": 0.49300998573466476, "grad_norm": 2.796875, "learning_rate": 4.499042004336642e-06, "loss": 0.4283, "mean_token_accuracy": 0.8771600425243378, "num_tokens": 1265254.0, "step": 216 }, { "entropy": 1.3496776968240738, "epoch": 0.49529243937232525, "grad_norm": 2.859375, "learning_rate": 4.4933726801429665e-06, "loss": 0.3705, "mean_token_accuracy": 0.8920829594135284, "num_tokens": 1271970.0, "step": 217 }, { "entropy": 1.5127773433923721, "epoch": 0.49757489300998575, "grad_norm": 3.78125, "learning_rate": 4.487675070985156e-06, "loss": 0.4624, "mean_token_accuracy": 0.8566678315401077, "num_tokens": 1277606.0, "step": 218 }, { "entropy": 1.4766086488962173, "epoch": 0.4998573466476462, "grad_norm": 3.796875, "learning_rate": 4.481949257709442e-06, "loss": 0.4412, "mean_token_accuracy": 0.8686520978808403, "num_tokens": 1283617.0, "step": 219 }, { "entropy": 1.5000656843185425, "epoch": 0.5021398002853067, "grad_norm": 4.40625, "learning_rate": 4.476195321562262e-06, "loss": 0.5898, "mean_token_accuracy": 0.8323855772614479, "num_tokens": 1289328.0, "step": 220 }, { "entropy": 1.4562593698501587, "epoch": 0.5044222539229671, "grad_norm": 3.625, "learning_rate": 4.470413344189098e-06, "loss": 0.4657, "mean_token_accuracy": 0.8688141480088234, "num_tokens": 1294897.0, "step": 221 }, { "entropy": 1.412929117679596, "epoch": 0.5067047075606277, "grad_norm": 3.515625, "learning_rate": 4.464603407633326e-06, "loss": 0.4717, "mean_token_accuracy": 0.8586973398923874, "num_tokens": 1300887.0, "step": 222 }, { "entropy": 1.5253776609897614, "epoch": 0.5089871611982881, "grad_norm": 3.390625, "learning_rate": 4.458765594335048e-06, "loss": 0.473, "mean_token_accuracy": 0.8543320819735527, "num_tokens": 1306712.0, "step": 223 }, { "entropy": 1.5946801453828812, "epoch": 0.5112696148359487, "grad_norm": 3.90625, "learning_rate": 4.452899987129922e-06, "loss": 0.5303, "mean_token_accuracy": 0.8440029099583626, "num_tokens": 1311955.0, "step": 224 }, { "entropy": 1.3364089578390121, "epoch": 0.5135520684736091, "grad_norm": 3.140625, "learning_rate": 4.44700666924799e-06, "loss": 0.3431, "mean_token_accuracy": 0.8987620249390602, "num_tokens": 1318460.0, "step": 225 }, { "entropy": 1.4394992887973785, "epoch": 0.5158345221112696, "grad_norm": 3.0, "learning_rate": 4.441085724312494e-06, "loss": 0.4805, "mean_token_accuracy": 0.861751489341259, "num_tokens": 1325269.0, "step": 226 }, { "entropy": 1.4739690721035004, "epoch": 0.5181169757489301, "grad_norm": 3.5, "learning_rate": 4.435137236338688e-06, "loss": 0.4712, "mean_token_accuracy": 0.8692339286208153, "num_tokens": 1331087.0, "step": 227 }, { "entropy": 1.408553659915924, "epoch": 0.5203994293865906, "grad_norm": 3.703125, "learning_rate": 4.42916128973265e-06, "loss": 0.545, "mean_token_accuracy": 0.8480049669742584, "num_tokens": 1336928.0, "step": 228 }, { "entropy": 1.4906915128231049, "epoch": 0.5226818830242511, "grad_norm": 4.3125, "learning_rate": 4.423157969290081e-06, "loss": 0.4943, "mean_token_accuracy": 0.8629228696227074, "num_tokens": 1341951.0, "step": 229 }, { "entropy": 1.5799495428800583, "epoch": 0.5249643366619116, "grad_norm": 3.875, "learning_rate": 4.417127360195107e-06, "loss": 0.454, "mean_token_accuracy": 0.8446270450949669, "num_tokens": 1346983.0, "step": 230 }, { "entropy": 1.3668962121009827, "epoch": 0.527246790299572, "grad_norm": 3.28125, "learning_rate": 4.41106954801906e-06, "loss": 0.3977, "mean_token_accuracy": 0.8871706500649452, "num_tokens": 1354122.0, "step": 231 }, { "entropy": 1.5603487640619278, "epoch": 0.5295292439372326, "grad_norm": 3.359375, "learning_rate": 4.404984618719275e-06, "loss": 0.4717, "mean_token_accuracy": 0.8657551482319832, "num_tokens": 1359608.0, "step": 232 }, { "entropy": 1.4570914506912231, "epoch": 0.531811697574893, "grad_norm": 3.1875, "learning_rate": 4.398872658637863e-06, "loss": 0.4311, "mean_token_accuracy": 0.8685552924871445, "num_tokens": 1365590.0, "step": 233 }, { "entropy": 1.329675242304802, "epoch": 0.5340941512125535, "grad_norm": 3.09375, "learning_rate": 4.39273375450049e-06, "loss": 0.4566, "mean_token_accuracy": 0.8627236634492874, "num_tokens": 1372145.0, "step": 234 }, { "entropy": 1.4357402175664902, "epoch": 0.536376604850214, "grad_norm": 3.5, "learning_rate": 4.386567993415144e-06, "loss": 0.4507, "mean_token_accuracy": 0.8667884543538094, "num_tokens": 1377900.0, "step": 235 }, { "entropy": 1.5077559649944305, "epoch": 0.5386590584878744, "grad_norm": 3.984375, "learning_rate": 4.3803754628708995e-06, "loss": 0.5176, "mean_token_accuracy": 0.8583211898803711, "num_tokens": 1383999.0, "step": 236 }, { "entropy": 1.3777508586645126, "epoch": 0.540941512125535, "grad_norm": 3.265625, "learning_rate": 4.3741562507366754e-06, "loss": 0.3431, "mean_token_accuracy": 0.8923545554280281, "num_tokens": 1390419.0, "step": 237 }, { "entropy": 1.4933728128671646, "epoch": 0.5432239657631954, "grad_norm": 3.203125, "learning_rate": 4.367910445259991e-06, "loss": 0.4044, "mean_token_accuracy": 0.8686385452747345, "num_tokens": 1396684.0, "step": 238 }, { "entropy": 1.4653480350971222, "epoch": 0.5455064194008559, "grad_norm": 3.25, "learning_rate": 4.361638135065711e-06, "loss": 0.4561, "mean_token_accuracy": 0.8716481998562813, "num_tokens": 1402830.0, "step": 239 }, { "entropy": 1.5274227857589722, "epoch": 0.5477888730385164, "grad_norm": 3.6875, "learning_rate": 4.355339409154788e-06, "loss": 0.5069, "mean_token_accuracy": 0.8373076170682907, "num_tokens": 1408506.0, "step": 240 }, { "entropy": 1.4511406421661377, "epoch": 0.5500713266761769, "grad_norm": 3.3125, "learning_rate": 4.3490143569030025e-06, "loss": 0.4684, "mean_token_accuracy": 0.8665965721011162, "num_tokens": 1414792.0, "step": 241 }, { "entropy": 1.3838857859373093, "epoch": 0.5523537803138374, "grad_norm": 3.4375, "learning_rate": 4.34266306805969e-06, "loss": 0.4547, "mean_token_accuracy": 0.8690644651651382, "num_tokens": 1420524.0, "step": 242 }, { "entropy": 1.4130767732858658, "epoch": 0.5546362339514979, "grad_norm": 3.46875, "learning_rate": 4.336285632746472e-06, "loss": 0.471, "mean_token_accuracy": 0.8564508408308029, "num_tokens": 1426426.0, "step": 243 }, { "entropy": 1.618276908993721, "epoch": 0.5569186875891583, "grad_norm": 4.03125, "learning_rate": 4.329882141455974e-06, "loss": 0.5143, "mean_token_accuracy": 0.8403759598731995, "num_tokens": 1431586.0, "step": 244 }, { "entropy": 1.4412871301174164, "epoch": 0.5592011412268189, "grad_norm": 3.90625, "learning_rate": 4.323452685050545e-06, "loss": 0.4539, "mean_token_accuracy": 0.863670825958252, "num_tokens": 1437354.0, "step": 245 }, { "entropy": 1.4914350509643555, "epoch": 0.5614835948644793, "grad_norm": 3.03125, "learning_rate": 4.316997354760965e-06, "loss": 0.3826, "mean_token_accuracy": 0.8802237138152122, "num_tokens": 1443221.0, "step": 246 }, { "entropy": 1.5026773810386658, "epoch": 0.5637660485021398, "grad_norm": 3.359375, "learning_rate": 4.3105162421851494e-06, "loss": 0.4275, "mean_token_accuracy": 0.8739782869815826, "num_tokens": 1448716.0, "step": 247 }, { "entropy": 1.467271402478218, "epoch": 0.5660485021398003, "grad_norm": 3.765625, "learning_rate": 4.304009439286855e-06, "loss": 0.4786, "mean_token_accuracy": 0.8454955220222473, "num_tokens": 1453607.0, "step": 248 }, { "entropy": 1.3084248155355453, "epoch": 0.5683309557774607, "grad_norm": 3.171875, "learning_rate": 4.297477038394368e-06, "loss": 0.4264, "mean_token_accuracy": 0.8782637789845467, "num_tokens": 1460122.0, "step": 249 }, { "entropy": 1.4157914519309998, "epoch": 0.5706134094151213, "grad_norm": 3.046875, "learning_rate": 4.2909191321992e-06, "loss": 0.4883, "mean_token_accuracy": 0.8630497455596924, "num_tokens": 1466789.0, "step": 250 }, { "entropy": 1.3701231330633163, "epoch": 0.5728958630527817, "grad_norm": 3.53125, "learning_rate": 4.28433581375477e-06, "loss": 0.4331, "mean_token_accuracy": 0.874555304646492, "num_tokens": 1472752.0, "step": 251 }, { "entropy": 1.5737513154745102, "epoch": 0.5751783166904422, "grad_norm": 3.625, "learning_rate": 4.2777271764750805e-06, "loss": 0.4553, "mean_token_accuracy": 0.8664311170578003, "num_tokens": 1478473.0, "step": 252 }, { "entropy": 1.525623768568039, "epoch": 0.5774607703281027, "grad_norm": 3.390625, "learning_rate": 4.271093314133401e-06, "loss": 0.466, "mean_token_accuracy": 0.8556927219033241, "num_tokens": 1484284.0, "step": 253 }, { "entropy": 1.4639706760644913, "epoch": 0.5797432239657632, "grad_norm": 3.75, "learning_rate": 4.264434320860929e-06, "loss": 0.5532, "mean_token_accuracy": 0.844054289162159, "num_tokens": 1490166.0, "step": 254 }, { "entropy": 1.5366946905851364, "epoch": 0.5820256776034237, "grad_norm": 3.65625, "learning_rate": 4.257750291145457e-06, "loss": 0.5268, "mean_token_accuracy": 0.8521439135074615, "num_tokens": 1495689.0, "step": 255 }, { "entropy": 1.5063273757696152, "epoch": 0.5843081312410842, "grad_norm": 3.796875, "learning_rate": 4.251041319830034e-06, "loss": 0.5244, "mean_token_accuracy": 0.8497593402862549, "num_tokens": 1501104.0, "step": 256 }, { "entropy": 1.5439026057720184, "epoch": 0.5865905848787446, "grad_norm": 3.3125, "learning_rate": 4.2443075021116166e-06, "loss": 0.3605, "mean_token_accuracy": 0.8726519420742989, "num_tokens": 1506924.0, "step": 257 }, { "entropy": 1.4876836389303207, "epoch": 0.5888730385164052, "grad_norm": 3.640625, "learning_rate": 4.237548933539718e-06, "loss": 0.4703, "mean_token_accuracy": 0.866664931178093, "num_tokens": 1512828.0, "step": 258 }, { "entropy": 1.480648323893547, "epoch": 0.5911554921540656, "grad_norm": 3.3125, "learning_rate": 4.230765710015058e-06, "loss": 0.466, "mean_token_accuracy": 0.8522143065929413, "num_tokens": 1518522.0, "step": 259 }, { "entropy": 1.4419532120227814, "epoch": 0.5934379457917262, "grad_norm": 3.703125, "learning_rate": 4.223957927788195e-06, "loss": 0.4973, "mean_token_accuracy": 0.8543191030621529, "num_tokens": 1523970.0, "step": 260 }, { "entropy": 1.4034761041402817, "epoch": 0.5957203994293866, "grad_norm": 2.984375, "learning_rate": 4.217125683458162e-06, "loss": 0.3724, "mean_token_accuracy": 0.8887425437569618, "num_tokens": 1530150.0, "step": 261 }, { "entropy": 1.5668024867773056, "epoch": 0.598002853067047, "grad_norm": 4.5, "learning_rate": 4.210269073971098e-06, "loss": 0.4921, "mean_token_accuracy": 0.8630413040518761, "num_tokens": 1535368.0, "step": 262 }, { "entropy": 1.4702572673559189, "epoch": 0.6002853067047076, "grad_norm": 3.40625, "learning_rate": 4.203388196618874e-06, "loss": 0.3834, "mean_token_accuracy": 0.8823850229382515, "num_tokens": 1541388.0, "step": 263 }, { "entropy": 1.353348970413208, "epoch": 0.602567760342368, "grad_norm": 3.03125, "learning_rate": 4.196483149037707e-06, "loss": 0.3882, "mean_token_accuracy": 0.8797778934240341, "num_tokens": 1547245.0, "step": 264 }, { "entropy": 1.3397300243377686, "epoch": 0.6048502139800286, "grad_norm": 3.453125, "learning_rate": 4.1895540292067765e-06, "loss": 0.4969, "mean_token_accuracy": 0.8677136451005936, "num_tokens": 1553007.0, "step": 265 }, { "entropy": 1.533875733613968, "epoch": 0.607132667617689, "grad_norm": 4.15625, "learning_rate": 4.18260093544684e-06, "loss": 0.5423, "mean_token_accuracy": 0.8619329035282135, "num_tokens": 1559044.0, "step": 266 }, { "entropy": 1.4415863156318665, "epoch": 0.6094151212553495, "grad_norm": 3.296875, "learning_rate": 4.1756239664188275e-06, "loss": 0.4586, "mean_token_accuracy": 0.8679523020982742, "num_tokens": 1565121.0, "step": 267 }, { "entropy": 1.5389132052659988, "epoch": 0.61169757489301, "grad_norm": 3.515625, "learning_rate": 4.168623221122451e-06, "loss": 0.3954, "mean_token_accuracy": 0.8800017014145851, "num_tokens": 1570839.0, "step": 268 }, { "entropy": 1.4849002212285995, "epoch": 0.6139800285306705, "grad_norm": 4.46875, "learning_rate": 4.161598798894795e-06, "loss": 0.5272, "mean_token_accuracy": 0.842116691172123, "num_tokens": 1576765.0, "step": 269 }, { "entropy": 1.526948407292366, "epoch": 0.6162624821683309, "grad_norm": 3.796875, "learning_rate": 4.154550799408906e-06, "loss": 0.4815, "mean_token_accuracy": 0.8517501726746559, "num_tokens": 1582404.0, "step": 270 }, { "entropy": 1.5471256375312805, "epoch": 0.6185449358059915, "grad_norm": 4.0625, "learning_rate": 4.147479322672383e-06, "loss": 0.5704, "mean_token_accuracy": 0.8349821045994759, "num_tokens": 1588027.0, "step": 271 }, { "entropy": 1.3742996156215668, "epoch": 0.6208273894436519, "grad_norm": 3.03125, "learning_rate": 4.1403844690259544e-06, "loss": 0.4357, "mean_token_accuracy": 0.8906814530491829, "num_tokens": 1594482.0, "step": 272 }, { "entropy": 1.7183980494737625, "epoch": 0.6231098430813125, "grad_norm": 4.625, "learning_rate": 4.1332663391420515e-06, "loss": 0.6023, "mean_token_accuracy": 0.8240282908082008, "num_tokens": 1599978.0, "step": 273 }, { "entropy": 1.4364304840564728, "epoch": 0.6253922967189729, "grad_norm": 3.140625, "learning_rate": 4.126125034023392e-06, "loss": 0.4642, "mean_token_accuracy": 0.8591607213020325, "num_tokens": 1606427.0, "step": 274 }, { "entropy": 1.4346765726804733, "epoch": 0.6276747503566333, "grad_norm": 3.28125, "learning_rate": 4.11896065500153e-06, "loss": 0.4256, "mean_token_accuracy": 0.8701624721288681, "num_tokens": 1612618.0, "step": 275 }, { "entropy": 1.625702291727066, "epoch": 0.6299572039942939, "grad_norm": 5.125, "learning_rate": 4.111773303735432e-06, "loss": 0.4558, "mean_token_accuracy": 0.8545658215880394, "num_tokens": 1617388.0, "step": 276 }, { "entropy": 1.4506097733974457, "epoch": 0.6322396576319543, "grad_norm": 3.046875, "learning_rate": 4.104563082210028e-06, "loss": 0.4293, "mean_token_accuracy": 0.8728143572807312, "num_tokens": 1623851.0, "step": 277 }, { "entropy": 1.5303080081939697, "epoch": 0.6345221112696149, "grad_norm": 3.515625, "learning_rate": 4.097330092734765e-06, "loss": 0.5024, "mean_token_accuracy": 0.8505230322480202, "num_tokens": 1629428.0, "step": 278 }, { "entropy": 1.4354898631572723, "epoch": 0.6368045649072753, "grad_norm": 3.03125, "learning_rate": 4.090074437942155e-06, "loss": 0.435, "mean_token_accuracy": 0.8785936459898949, "num_tokens": 1635769.0, "step": 279 }, { "entropy": 1.547384113073349, "epoch": 0.6390870185449358, "grad_norm": 4.0, "learning_rate": 4.082796220786324e-06, "loss": 0.5469, "mean_token_accuracy": 0.8383355513215065, "num_tokens": 1641791.0, "step": 280 }, { "entropy": 1.480806604027748, "epoch": 0.6413694721825963, "grad_norm": 4.4375, "learning_rate": 4.0754955445415405e-06, "loss": 0.4233, "mean_token_accuracy": 0.8961210995912552, "num_tokens": 1646709.0, "step": 281 }, { "entropy": 1.4669694900512695, "epoch": 0.6436519258202568, "grad_norm": 4.0, "learning_rate": 4.06817251280076e-06, "loss": 0.4288, "mean_token_accuracy": 0.8806118816137314, "num_tokens": 1651676.0, "step": 282 }, { "entropy": 1.4136276096105576, "epoch": 0.6459343794579172, "grad_norm": 3.15625, "learning_rate": 4.06082722947415e-06, "loss": 0.4005, "mean_token_accuracy": 0.8672489523887634, "num_tokens": 1657293.0, "step": 283 }, { "entropy": 1.4642555862665176, "epoch": 0.6482168330955778, "grad_norm": 3.421875, "learning_rate": 4.053459798787619e-06, "loss": 0.4534, "mean_token_accuracy": 0.8670831546187401, "num_tokens": 1662778.0, "step": 284 }, { "entropy": 1.4143490493297577, "epoch": 0.6504992867332382, "grad_norm": 3.015625, "learning_rate": 4.046070325281333e-06, "loss": 0.4511, "mean_token_accuracy": 0.8704198077321053, "num_tokens": 1669050.0, "step": 285 }, { "entropy": 1.402878537774086, "epoch": 0.6527817403708988, "grad_norm": 2.78125, "learning_rate": 4.038658913808235e-06, "loss": 0.3552, "mean_token_accuracy": 0.8852335959672928, "num_tokens": 1675168.0, "step": 286 }, { "entropy": 1.4332346022129059, "epoch": 0.6550641940085592, "grad_norm": 3.359375, "learning_rate": 4.031225669532558e-06, "loss": 0.4411, "mean_token_accuracy": 0.8605756536126137, "num_tokens": 1680716.0, "step": 287 }, { "entropy": 1.4855122715234756, "epoch": 0.6573466476462196, "grad_norm": 3.890625, "learning_rate": 4.0237706979283306e-06, "loss": 0.5067, "mean_token_accuracy": 0.8480587676167488, "num_tokens": 1686358.0, "step": 288 }, { "entropy": 1.415476381778717, "epoch": 0.6596291012838802, "grad_norm": 3.109375, "learning_rate": 4.016294104777883e-06, "loss": 0.3724, "mean_token_accuracy": 0.8872483521699905, "num_tokens": 1692477.0, "step": 289 }, { "entropy": 1.4918617755174637, "epoch": 0.6619115549215406, "grad_norm": 3.546875, "learning_rate": 4.008795996170341e-06, "loss": 0.481, "mean_token_accuracy": 0.8568604290485382, "num_tokens": 1698377.0, "step": 290 }, { "entropy": 1.3961764425039291, "epoch": 0.6641940085592012, "grad_norm": 3.015625, "learning_rate": 4.001276478500127e-06, "loss": 0.3972, "mean_token_accuracy": 0.885112538933754, "num_tokens": 1704209.0, "step": 291 }, { "entropy": 1.4769706726074219, "epoch": 0.6664764621968616, "grad_norm": 3.640625, "learning_rate": 3.993735658465446e-06, "loss": 0.5053, "mean_token_accuracy": 0.8577989414334297, "num_tokens": 1710422.0, "step": 292 }, { "entropy": 1.3838878571987152, "epoch": 0.6687589158345221, "grad_norm": 2.96875, "learning_rate": 3.986173643066774e-06, "loss": 0.3759, "mean_token_accuracy": 0.8760515302419662, "num_tokens": 1716105.0, "step": 293 }, { "entropy": 1.3878977000713348, "epoch": 0.6710413694721826, "grad_norm": 2.796875, "learning_rate": 3.978590539605338e-06, "loss": 0.329, "mean_token_accuracy": 0.8979349583387375, "num_tokens": 1723015.0, "step": 294 }, { "entropy": 1.4417504221200943, "epoch": 0.6733238231098431, "grad_norm": 3.453125, "learning_rate": 3.970986455681593e-06, "loss": 0.5339, "mean_token_accuracy": 0.854948602616787, "num_tokens": 1729102.0, "step": 295 }, { "entropy": 1.4643060863018036, "epoch": 0.6756062767475036, "grad_norm": 3.171875, "learning_rate": 3.963361499193699e-06, "loss": 0.4545, "mean_token_accuracy": 0.8652586191892624, "num_tokens": 1734903.0, "step": 296 }, { "entropy": 1.4911223948001862, "epoch": 0.6778887303851641, "grad_norm": 3.25, "learning_rate": 3.955715778335984e-06, "loss": 0.4584, "mean_token_accuracy": 0.8684913441538811, "num_tokens": 1740820.0, "step": 297 }, { "entropy": 1.414558470249176, "epoch": 0.6801711840228245, "grad_norm": 3.046875, "learning_rate": 3.948049401597414e-06, "loss": 0.4304, "mean_token_accuracy": 0.8772279694676399, "num_tokens": 1747285.0, "step": 298 }, { "entropy": 1.4684519618749619, "epoch": 0.6824536376604851, "grad_norm": 3.5, "learning_rate": 3.9403624777600526e-06, "loss": 0.3402, "mean_token_accuracy": 0.8974613174796104, "num_tokens": 1752238.0, "step": 299 }, { "entropy": 1.513798087835312, "epoch": 0.6847360912981455, "grad_norm": 4.4375, "learning_rate": 3.932655115897513e-06, "loss": 0.518, "mean_token_accuracy": 0.8387879729270935, "num_tokens": 1757263.0, "step": 300 }, { "epoch": 0.6847360912981455, "eval_entropy": 1.4728518161508772, "eval_loss": 0.4787273108959198, "eval_mean_token_accuracy": 0.8652989500098758, "eval_num_tokens": 1757263.0, "eval_runtime": 4.4526, "eval_samples_per_second": 20.213, "eval_steps_per_second": 20.213, "step": 300 }, { "entropy": 1.5329459309577942, "epoch": 0.6870185449358059, "grad_norm": 3.0625, "learning_rate": 3.924927425373417e-06, "loss": 0.3762, "mean_token_accuracy": 0.8721340969204903, "num_tokens": 1762777.0, "step": 301 }, { "entropy": 1.3736970275640488, "epoch": 0.6893009985734665, "grad_norm": 2.84375, "learning_rate": 3.91717951583984e-06, "loss": 0.403, "mean_token_accuracy": 0.8769481182098389, "num_tokens": 1769311.0, "step": 302 }, { "entropy": 1.4778434038162231, "epoch": 0.6915834522111269, "grad_norm": 3.265625, "learning_rate": 3.909411497235752e-06, "loss": 0.4176, "mean_token_accuracy": 0.8799067437648773, "num_tokens": 1775618.0, "step": 303 }, { "entropy": 1.493824690580368, "epoch": 0.6938659058487875, "grad_norm": 3.046875, "learning_rate": 3.901623479785465e-06, "loss": 0.4883, "mean_token_accuracy": 0.8613429367542267, "num_tokens": 1782559.0, "step": 304 }, { "entropy": 1.412913128733635, "epoch": 0.6961483594864479, "grad_norm": 4.28125, "learning_rate": 3.89381557399706e-06, "loss": 0.4606, "mean_token_accuracy": 0.8659727945923805, "num_tokens": 1788268.0, "step": 305 }, { "entropy": 1.3977010250091553, "epoch": 0.6984308131241084, "grad_norm": 3.046875, "learning_rate": 3.885987890660828e-06, "loss": 0.3609, "mean_token_accuracy": 0.8855833634734154, "num_tokens": 1794289.0, "step": 306 }, { "entropy": 1.4351555556058884, "epoch": 0.7007132667617689, "grad_norm": 3.203125, "learning_rate": 3.87814054084769e-06, "loss": 0.3922, "mean_token_accuracy": 0.882360152900219, "num_tokens": 1800100.0, "step": 307 }, { "entropy": 1.4589954763650894, "epoch": 0.7029957203994294, "grad_norm": 3.84375, "learning_rate": 3.8702736359076265e-06, "loss": 0.4728, "mean_token_accuracy": 0.8583435043692589, "num_tokens": 1806175.0, "step": 308 }, { "entropy": 1.497866302728653, "epoch": 0.7052781740370899, "grad_norm": 3.984375, "learning_rate": 3.862387287468095e-06, "loss": 0.5149, "mean_token_accuracy": 0.8527609705924988, "num_tokens": 1811406.0, "step": 309 }, { "entropy": 1.5516266524791718, "epoch": 0.7075606276747504, "grad_norm": 3.5, "learning_rate": 3.854481607432445e-06, "loss": 0.4476, "mean_token_accuracy": 0.8626842275261879, "num_tokens": 1816804.0, "step": 310 }, { "entropy": 1.3300371170043945, "epoch": 0.7098430813124108, "grad_norm": 3.0625, "learning_rate": 3.846556707978337e-06, "loss": 0.4001, "mean_token_accuracy": 0.8860765770077705, "num_tokens": 1823102.0, "step": 311 }, { "entropy": 1.5138549208641052, "epoch": 0.7121255349500714, "grad_norm": 3.71875, "learning_rate": 3.838612701556138e-06, "loss": 0.4696, "mean_token_accuracy": 0.8707823753356934, "num_tokens": 1828740.0, "step": 312 }, { "entropy": 1.4780635386705399, "epoch": 0.7144079885877318, "grad_norm": 3.140625, "learning_rate": 3.830649700887339e-06, "loss": 0.4598, "mean_token_accuracy": 0.8627598807215691, "num_tokens": 1835314.0, "step": 313 }, { "entropy": 1.483512207865715, "epoch": 0.7166904422253922, "grad_norm": 3.859375, "learning_rate": 3.822667818962948e-06, "loss": 0.3944, "mean_token_accuracy": 0.8666610270738602, "num_tokens": 1840589.0, "step": 314 }, { "entropy": 1.351017713546753, "epoch": 0.7189728958630528, "grad_norm": 3.046875, "learning_rate": 3.814667169041887e-06, "loss": 0.4589, "mean_token_accuracy": 0.8681119009852409, "num_tokens": 1846865.0, "step": 315 }, { "entropy": 1.454156056046486, "epoch": 0.7212553495007132, "grad_norm": 2.859375, "learning_rate": 3.8066478646493898e-06, "loss": 0.3616, "mean_token_accuracy": 0.887380801141262, "num_tokens": 1853343.0, "step": 316 }, { "entropy": 1.3507359623908997, "epoch": 0.7235378031383738, "grad_norm": 3.109375, "learning_rate": 3.798610019575384e-06, "loss": 0.3908, "mean_token_accuracy": 0.8893059492111206, "num_tokens": 1859535.0, "step": 317 }, { "entropy": 1.5166684240102768, "epoch": 0.7258202567760342, "grad_norm": 4.3125, "learning_rate": 3.790553747872885e-06, "loss": 0.5235, "mean_token_accuracy": 0.8411901965737343, "num_tokens": 1864957.0, "step": 318 }, { "entropy": 1.4589732587337494, "epoch": 0.7281027104136947, "grad_norm": 3.34375, "learning_rate": 3.7824791638563674e-06, "loss": 0.4074, "mean_token_accuracy": 0.8821713030338287, "num_tokens": 1870586.0, "step": 319 }, { "entropy": 1.429191216826439, "epoch": 0.7303851640513552, "grad_norm": 3.40625, "learning_rate": 3.7743863821001538e-06, "loss": 0.4902, "mean_token_accuracy": 0.8597285747528076, "num_tokens": 1876572.0, "step": 320 }, { "entropy": 1.52955062687397, "epoch": 0.7326676176890157, "grad_norm": 4.09375, "learning_rate": 3.766275517436779e-06, "loss": 0.5007, "mean_token_accuracy": 0.8509823232889175, "num_tokens": 1881581.0, "step": 321 }, { "entropy": 1.6073177456855774, "epoch": 0.7349500713266762, "grad_norm": 4.9375, "learning_rate": 3.7581466849553685e-06, "loss": 0.5742, "mean_token_accuracy": 0.8330699577927589, "num_tokens": 1886980.0, "step": 322 }, { "entropy": 1.490510642528534, "epoch": 0.7372325249643367, "grad_norm": 4.15625, "learning_rate": 3.7500000000000005e-06, "loss": 0.5597, "mean_token_accuracy": 0.8421717286109924, "num_tokens": 1892848.0, "step": 323 }, { "entropy": 1.4249206632375717, "epoch": 0.7395149786019971, "grad_norm": 3.59375, "learning_rate": 3.741835578168071e-06, "loss": 0.5289, "mean_token_accuracy": 0.8406483083963394, "num_tokens": 1899057.0, "step": 324 }, { "entropy": 1.4169852286577225, "epoch": 0.7417974322396577, "grad_norm": 3.421875, "learning_rate": 3.7336535353086546e-06, "loss": 0.4855, "mean_token_accuracy": 0.8616788312792778, "num_tokens": 1905042.0, "step": 325 }, { "entropy": 1.5189264565706253, "epoch": 0.7440798858773181, "grad_norm": 3.59375, "learning_rate": 3.7254539875208577e-06, "loss": 0.5092, "mean_token_accuracy": 0.8563691675662994, "num_tokens": 1910608.0, "step": 326 }, { "entropy": 1.4049191176891327, "epoch": 0.7463623395149787, "grad_norm": 3.40625, "learning_rate": 3.717237051152175e-06, "loss": 0.4253, "mean_token_accuracy": 0.8755350038409233, "num_tokens": 1916900.0, "step": 327 }, { "entropy": 1.4406355023384094, "epoch": 0.7486447931526391, "grad_norm": 3.3125, "learning_rate": 3.7090028427968343e-06, "loss": 0.5454, "mean_token_accuracy": 0.8430257961153984, "num_tokens": 1923487.0, "step": 328 }, { "entropy": 1.4147418439388275, "epoch": 0.7509272467902995, "grad_norm": 3.734375, "learning_rate": 3.7007514792941462e-06, "loss": 0.4328, "mean_token_accuracy": 0.873896099627018, "num_tokens": 1929126.0, "step": 329 }, { "entropy": 1.4407319128513336, "epoch": 0.7532097004279601, "grad_norm": 3.609375, "learning_rate": 3.692483077726843e-06, "loss": 0.4482, "mean_token_accuracy": 0.8734828159213066, "num_tokens": 1935299.0, "step": 330 }, { "entropy": 1.3825494647026062, "epoch": 0.7554921540656205, "grad_norm": 3.046875, "learning_rate": 3.684197755419419e-06, "loss": 0.3914, "mean_token_accuracy": 0.8881862461566925, "num_tokens": 1941583.0, "step": 331 }, { "entropy": 1.5679688155651093, "epoch": 0.757774607703281, "grad_norm": 3.5, "learning_rate": 3.6758956299364643e-06, "loss": 0.5205, "mean_token_accuracy": 0.850575216114521, "num_tokens": 1947719.0, "step": 332 }, { "entropy": 1.478807806968689, "epoch": 0.7600570613409415, "grad_norm": 3.953125, "learning_rate": 3.6675768190810023e-06, "loss": 0.5383, "mean_token_accuracy": 0.8558880761265755, "num_tokens": 1952792.0, "step": 333 }, { "entropy": 1.4567435085773468, "epoch": 0.762339514978602, "grad_norm": 3.625, "learning_rate": 3.659241440892806e-06, "loss": 0.4479, "mean_token_accuracy": 0.8747463598847389, "num_tokens": 1959114.0, "step": 334 }, { "entropy": 1.3806256204843521, "epoch": 0.7646219686162625, "grad_norm": 2.90625, "learning_rate": 3.6508896136467376e-06, "loss": 0.3259, "mean_token_accuracy": 0.9004263803362846, "num_tokens": 1965297.0, "step": 335 }, { "entropy": 1.3886072635650635, "epoch": 0.766904422253923, "grad_norm": 2.75, "learning_rate": 3.642521455851058e-06, "loss": 0.3218, "mean_token_accuracy": 0.8972492516040802, "num_tokens": 1972145.0, "step": 336 }, { "entropy": 1.4320484548807144, "epoch": 0.7691868758915834, "grad_norm": 3.671875, "learning_rate": 3.634137086245754e-06, "loss": 0.4502, "mean_token_accuracy": 0.8562175408005714, "num_tokens": 1977851.0, "step": 337 }, { "entropy": 1.606041207909584, "epoch": 0.771469329529244, "grad_norm": 3.921875, "learning_rate": 3.625736623800849e-06, "loss": 0.5698, "mean_token_accuracy": 0.8275244310498238, "num_tokens": 1983459.0, "step": 338 }, { "entropy": 1.396391972899437, "epoch": 0.7737517831669044, "grad_norm": 3.125, "learning_rate": 3.6173201877147134e-06, "loss": 0.4157, "mean_token_accuracy": 0.8768060877919197, "num_tokens": 1989443.0, "step": 339 }, { "entropy": 1.2999221831560135, "epoch": 0.776034236804565, "grad_norm": 2.65625, "learning_rate": 3.6088878974123796e-06, "loss": 0.3211, "mean_token_accuracy": 0.9015626162290573, "num_tokens": 1996081.0, "step": 340 }, { "entropy": 1.4438531249761581, "epoch": 0.7783166904422254, "grad_norm": 3.109375, "learning_rate": 3.6004398725438406e-06, "loss": 0.4224, "mean_token_accuracy": 0.8693163841962814, "num_tokens": 2002046.0, "step": 341 }, { "entropy": 1.5661528557538986, "epoch": 0.7805991440798858, "grad_norm": 3.8125, "learning_rate": 3.5919762329823556e-06, "loss": 0.4583, "mean_token_accuracy": 0.8407174274325371, "num_tokens": 2007992.0, "step": 342 }, { "entropy": 1.5423270612955093, "epoch": 0.7828815977175464, "grad_norm": 3.828125, "learning_rate": 3.5834970988227484e-06, "loss": 0.5046, "mean_token_accuracy": 0.8615901097655296, "num_tokens": 2013678.0, "step": 343 }, { "entropy": 1.3757345080375671, "epoch": 0.7851640513552068, "grad_norm": 3.078125, "learning_rate": 3.5750025903797053e-06, "loss": 0.435, "mean_token_accuracy": 0.8637730702757835, "num_tokens": 2019976.0, "step": 344 }, { "entropy": 1.5496114045381546, "epoch": 0.7874465049928673, "grad_norm": 4.03125, "learning_rate": 3.566492828186063e-06, "loss": 0.466, "mean_token_accuracy": 0.861820325255394, "num_tokens": 2025396.0, "step": 345 }, { "entropy": 1.4001742899417877, "epoch": 0.7897289586305278, "grad_norm": 3.265625, "learning_rate": 3.5579679329911025e-06, "loss": 0.4244, "mean_token_accuracy": 0.8774027079343796, "num_tokens": 2031341.0, "step": 346 }, { "entropy": 1.4246700257062912, "epoch": 0.7920114122681883, "grad_norm": 2.59375, "learning_rate": 3.5494280257588367e-06, "loss": 0.3573, "mean_token_accuracy": 0.8994497805833817, "num_tokens": 2038154.0, "step": 347 }, { "entropy": 1.4771685898303986, "epoch": 0.7942938659058488, "grad_norm": 3.5625, "learning_rate": 3.5408732276662882e-06, "loss": 0.4837, "mean_token_accuracy": 0.8569220453500748, "num_tokens": 2043977.0, "step": 348 }, { "entropy": 1.3758689016103745, "epoch": 0.7965763195435093, "grad_norm": 3.484375, "learning_rate": 3.532303660101776e-06, "loss": 0.4086, "mean_token_accuracy": 0.8799771890044212, "num_tokens": 2049581.0, "step": 349 }, { "entropy": 1.4391580671072006, "epoch": 0.7988587731811697, "grad_norm": 3.4375, "learning_rate": 3.5237194446631883e-06, "loss": 0.4414, "mean_token_accuracy": 0.8686051443219185, "num_tokens": 2054885.0, "step": 350 }, { "entropy": 1.572434514760971, "epoch": 0.8011412268188303, "grad_norm": 3.578125, "learning_rate": 3.515120703156264e-06, "loss": 0.4561, "mean_token_accuracy": 0.869783990085125, "num_tokens": 2060752.0, "step": 351 }, { "entropy": 1.3927340656518936, "epoch": 0.8034236804564907, "grad_norm": 3.25, "learning_rate": 3.506507557592853e-06, "loss": 0.3986, "mean_token_accuracy": 0.8710938170552254, "num_tokens": 2066701.0, "step": 352 }, { "entropy": 1.6066904217004776, "epoch": 0.8057061340941513, "grad_norm": 4.5, "learning_rate": 3.4978801301891972e-06, "loss": 0.5213, "mean_token_accuracy": 0.8417335525155067, "num_tokens": 2072037.0, "step": 353 }, { "entropy": 1.5368521958589554, "epoch": 0.8079885877318117, "grad_norm": 3.6875, "learning_rate": 3.4892385433641875e-06, "loss": 0.5679, "mean_token_accuracy": 0.8372282758355141, "num_tokens": 2077090.0, "step": 354 }, { "entropy": 1.4477348923683167, "epoch": 0.8102710413694721, "grad_norm": 3.140625, "learning_rate": 3.480582919737631e-06, "loss": 0.4322, "mean_token_accuracy": 0.8827796950936317, "num_tokens": 2083157.0, "step": 355 }, { "entropy": 1.449633464217186, "epoch": 0.8125534950071327, "grad_norm": 3.578125, "learning_rate": 3.4719133821285108e-06, "loss": 0.497, "mean_token_accuracy": 0.8483736291527748, "num_tokens": 2089047.0, "step": 356 }, { "entropy": 1.4000667333602905, "epoch": 0.8148359486447931, "grad_norm": 3.109375, "learning_rate": 3.4632300535532415e-06, "loss": 0.5416, "mean_token_accuracy": 0.8374148234724998, "num_tokens": 2095911.0, "step": 357 }, { "entropy": 1.5335423648357391, "epoch": 0.8171184022824537, "grad_norm": 3.703125, "learning_rate": 3.4545330572239234e-06, "loss": 0.4418, "mean_token_accuracy": 0.8705498203635216, "num_tokens": 2101062.0, "step": 358 }, { "entropy": 1.4877882897853851, "epoch": 0.8194008559201141, "grad_norm": 2.96875, "learning_rate": 3.445822516546598e-06, "loss": 0.382, "mean_token_accuracy": 0.885826900601387, "num_tokens": 2107503.0, "step": 359 }, { "entropy": 1.5615941286087036, "epoch": 0.8216833095577746, "grad_norm": 3.515625, "learning_rate": 3.437098555119493e-06, "loss": 0.4703, "mean_token_accuracy": 0.8597147017717361, "num_tokens": 2112957.0, "step": 360 }, { "entropy": 1.4338414072990417, "epoch": 0.8239657631954351, "grad_norm": 3.65625, "learning_rate": 3.4283612967312692e-06, "loss": 0.4431, "mean_token_accuracy": 0.8747149705886841, "num_tokens": 2119534.0, "step": 361 }, { "entropy": 1.3991961032152176, "epoch": 0.8262482168330956, "grad_norm": 2.8125, "learning_rate": 3.4196108653592662e-06, "loss": 0.3343, "mean_token_accuracy": 0.9073175340890884, "num_tokens": 2125905.0, "step": 362 }, { "entropy": 1.4029065370559692, "epoch": 0.828530670470756, "grad_norm": 3.265625, "learning_rate": 3.4108473851677408e-06, "loss": 0.3691, "mean_token_accuracy": 0.8828721046447754, "num_tokens": 2132517.0, "step": 363 }, { "entropy": 1.4478721916675568, "epoch": 0.8308131241084166, "grad_norm": 3.0, "learning_rate": 3.4020709805061066e-06, "loss": 0.399, "mean_token_accuracy": 0.8760695457458496, "num_tokens": 2138908.0, "step": 364 }, { "entropy": 1.470540538430214, "epoch": 0.833095577746077, "grad_norm": 3.59375, "learning_rate": 3.3932817759071666e-06, "loss": 0.4839, "mean_token_accuracy": 0.8647991716861725, "num_tokens": 2144936.0, "step": 365 }, { "entropy": 1.3821264803409576, "epoch": 0.8353780313837376, "grad_norm": 3.40625, "learning_rate": 3.3844798960853533e-06, "loss": 0.4712, "mean_token_accuracy": 0.8681535720825195, "num_tokens": 2151022.0, "step": 366 }, { "entropy": 1.4431174248456955, "epoch": 0.837660485021398, "grad_norm": 3.484375, "learning_rate": 3.3756654659349487e-06, "loss": 0.4008, "mean_token_accuracy": 0.8728353902697563, "num_tokens": 2156626.0, "step": 367 }, { "entropy": 1.3731088489294052, "epoch": 0.8399429386590584, "grad_norm": 3.40625, "learning_rate": 3.3668386105283226e-06, "loss": 0.4741, "mean_token_accuracy": 0.863268293440342, "num_tokens": 2163234.0, "step": 368 }, { "entropy": 1.4210239797830582, "epoch": 0.842225392296719, "grad_norm": 2.921875, "learning_rate": 3.357999455114148e-06, "loss": 0.4039, "mean_token_accuracy": 0.8817742839455605, "num_tokens": 2169749.0, "step": 369 }, { "entropy": 1.4794443249702454, "epoch": 0.8445078459343794, "grad_norm": 3.25, "learning_rate": 3.3491481251156355e-06, "loss": 0.4879, "mean_token_accuracy": 0.8580229580402374, "num_tokens": 2175776.0, "step": 370 }, { "entropy": 1.6413906067609787, "epoch": 0.84679029957204, "grad_norm": 4.1875, "learning_rate": 3.34028474612874e-06, "loss": 0.4411, "mean_token_accuracy": 0.8557733818888664, "num_tokens": 2180562.0, "step": 371 }, { "entropy": 1.410418540239334, "epoch": 0.8490727532097004, "grad_norm": 3.359375, "learning_rate": 3.3314094439203903e-06, "loss": 0.4152, "mean_token_accuracy": 0.8825007230043411, "num_tokens": 2185764.0, "step": 372 }, { "entropy": 1.479749009013176, "epoch": 0.8513552068473609, "grad_norm": 3.578125, "learning_rate": 3.322522344426698e-06, "loss": 0.4534, "mean_token_accuracy": 0.8688785433769226, "num_tokens": 2191225.0, "step": 373 }, { "entropy": 1.4503730237483978, "epoch": 0.8536376604850214, "grad_norm": 3.1875, "learning_rate": 3.3136235737511715e-06, "loss": 0.3714, "mean_token_accuracy": 0.8881650194525719, "num_tokens": 2196792.0, "step": 374 }, { "entropy": 1.3789267241954803, "epoch": 0.8559201141226819, "grad_norm": 2.953125, "learning_rate": 3.3047132581629297e-06, "loss": 0.398, "mean_token_accuracy": 0.8848712220788002, "num_tokens": 2203140.0, "step": 375 }, { "entropy": 1.4894972145557404, "epoch": 0.8582025677603423, "grad_norm": 3.203125, "learning_rate": 3.295791524094906e-06, "loss": 0.3865, "mean_token_accuracy": 0.8710450083017349, "num_tokens": 2209122.0, "step": 376 }, { "entropy": 1.3985904306173325, "epoch": 0.8604850213980029, "grad_norm": 2.875, "learning_rate": 3.286858498142057e-06, "loss": 0.4158, "mean_token_accuracy": 0.878923624753952, "num_tokens": 2215258.0, "step": 377 }, { "entropy": 1.548867017030716, "epoch": 0.8627674750356633, "grad_norm": 3.484375, "learning_rate": 3.277914307059566e-06, "loss": 0.5408, "mean_token_accuracy": 0.8471002653241158, "num_tokens": 2221371.0, "step": 378 }, { "entropy": 1.4772979021072388, "epoch": 0.8650499286733239, "grad_norm": 3.25, "learning_rate": 3.2689590777610443e-06, "loss": 0.3972, "mean_token_accuracy": 0.8763172924518585, "num_tokens": 2227158.0, "step": 379 }, { "entropy": 1.5023012608289719, "epoch": 0.8673323823109843, "grad_norm": 2.96875, "learning_rate": 3.259992937316727e-06, "loss": 0.4516, "mean_token_accuracy": 0.8623324111104012, "num_tokens": 2233629.0, "step": 380 }, { "entropy": 1.5667530596256256, "epoch": 0.8696148359486447, "grad_norm": 5.0625, "learning_rate": 3.251016012951678e-06, "loss": 0.6043, "mean_token_accuracy": 0.8312884569168091, "num_tokens": 2239082.0, "step": 381 }, { "entropy": 1.380866751074791, "epoch": 0.8718972895863053, "grad_norm": 3.140625, "learning_rate": 3.242028432043974e-06, "loss": 0.4196, "mean_token_accuracy": 0.8756621181964874, "num_tokens": 2245272.0, "step": 382 }, { "entropy": 1.4950210005044937, "epoch": 0.8741797432239657, "grad_norm": 3.265625, "learning_rate": 3.2330303221229078e-06, "loss": 0.4317, "mean_token_accuracy": 0.8579834923148155, "num_tokens": 2251010.0, "step": 383 }, { "entropy": 1.7085559666156769, "epoch": 0.8764621968616263, "grad_norm": 4.6875, "learning_rate": 3.2240218108671683e-06, "loss": 0.6511, "mean_token_accuracy": 0.8028427958488464, "num_tokens": 2256288.0, "step": 384 }, { "entropy": 1.579810380935669, "epoch": 0.8787446504992867, "grad_norm": 3.5, "learning_rate": 3.2150030261030414e-06, "loss": 0.4849, "mean_token_accuracy": 0.8453002646565437, "num_tokens": 2262186.0, "step": 385 }, { "entropy": 1.5028070509433746, "epoch": 0.8810271041369472, "grad_norm": 3.6875, "learning_rate": 3.205974095802582e-06, "loss": 0.5576, "mean_token_accuracy": 0.8453918322920799, "num_tokens": 2268003.0, "step": 386 }, { "entropy": 1.50083489716053, "epoch": 0.8833095577746077, "grad_norm": 3.859375, "learning_rate": 3.196935148081808e-06, "loss": 0.5821, "mean_token_accuracy": 0.8238921985030174, "num_tokens": 2273238.0, "step": 387 }, { "entropy": 1.460751935839653, "epoch": 0.8855920114122682, "grad_norm": 3.078125, "learning_rate": 3.187886311198881e-06, "loss": 0.463, "mean_token_accuracy": 0.8708171024918556, "num_tokens": 2279778.0, "step": 388 }, { "entropy": 1.3422992527484894, "epoch": 0.8878744650499286, "grad_norm": 3.28125, "learning_rate": 3.178827713552281e-06, "loss": 0.4008, "mean_token_accuracy": 0.875513955950737, "num_tokens": 2286016.0, "step": 389 }, { "entropy": 1.5027628540992737, "epoch": 0.8901569186875892, "grad_norm": 3.328125, "learning_rate": 3.1697594836789924e-06, "loss": 0.5086, "mean_token_accuracy": 0.8417061790823936, "num_tokens": 2291896.0, "step": 390 }, { "entropy": 1.5571343451738358, "epoch": 0.8924393723252496, "grad_norm": 4.53125, "learning_rate": 3.160681750252674e-06, "loss": 0.5863, "mean_token_accuracy": 0.8346568569540977, "num_tokens": 2296989.0, "step": 391 }, { "entropy": 1.4478174448013306, "epoch": 0.8947218259629102, "grad_norm": 3.265625, "learning_rate": 3.1515946420818343e-06, "loss": 0.4618, "mean_token_accuracy": 0.8564577624201775, "num_tokens": 2303240.0, "step": 392 }, { "entropy": 1.4417100101709366, "epoch": 0.8970042796005706, "grad_norm": 3.3125, "learning_rate": 3.142498288108007e-06, "loss": 0.5086, "mean_token_accuracy": 0.8544816300272942, "num_tokens": 2308819.0, "step": 393 }, { "entropy": 1.549110621213913, "epoch": 0.899286733238231, "grad_norm": 3.234375, "learning_rate": 3.133392817403919e-06, "loss": 0.4943, "mean_token_accuracy": 0.8492691740393639, "num_tokens": 2315199.0, "step": 394 }, { "entropy": 1.437395378947258, "epoch": 0.9015691868758916, "grad_norm": 3.265625, "learning_rate": 3.124278359171657e-06, "loss": 0.4162, "mean_token_accuracy": 0.8790151923894882, "num_tokens": 2321449.0, "step": 395 }, { "entropy": 1.4882071912288666, "epoch": 0.903851640513552, "grad_norm": 3.15625, "learning_rate": 3.1151550427408383e-06, "loss": 0.3974, "mean_token_accuracy": 0.8646276146173477, "num_tokens": 2327198.0, "step": 396 }, { "entropy": 1.414357990026474, "epoch": 0.9061340941512126, "grad_norm": 3.640625, "learning_rate": 3.1060229975667716e-06, "loss": 0.3884, "mean_token_accuracy": 0.874775730073452, "num_tokens": 2333184.0, "step": 397 }, { "entropy": 1.5017937868833542, "epoch": 0.908416547788873, "grad_norm": 3.09375, "learning_rate": 3.0968823532286246e-06, "loss": 0.4596, "mean_token_accuracy": 0.8661977797746658, "num_tokens": 2339353.0, "step": 398 }, { "entropy": 1.4912959188222885, "epoch": 0.9106990014265335, "grad_norm": 3.015625, "learning_rate": 3.0877332394275806e-06, "loss": 0.3845, "mean_token_accuracy": 0.8872612118721008, "num_tokens": 2345323.0, "step": 399 }, { "entropy": 1.5040694773197174, "epoch": 0.912981455064194, "grad_norm": 3.84375, "learning_rate": 3.0785757859850025e-06, "loss": 0.4793, "mean_token_accuracy": 0.8584380373358727, "num_tokens": 2350382.0, "step": 400 }, { "epoch": 0.912981455064194, "eval_entropy": 1.4835859013928308, "eval_loss": 0.47563549876213074, "eval_mean_token_accuracy": 0.8651414997047848, "eval_num_tokens": 2350382.0, "eval_runtime": 4.4144, "eval_samples_per_second": 20.388, "eval_steps_per_second": 20.388, "step": 400 }, { "entropy": 1.4684801995754242, "epoch": 0.9152639087018545, "grad_norm": 3.765625, "learning_rate": 3.069410122840585e-06, "loss": 0.4838, "mean_token_accuracy": 0.8577789217233658, "num_tokens": 2356642.0, "step": 401 }, { "entropy": 1.4736972451210022, "epoch": 0.917546362339515, "grad_norm": 3.09375, "learning_rate": 3.0602363800505198e-06, "loss": 0.4626, "mean_token_accuracy": 0.8666577711701393, "num_tokens": 2363069.0, "step": 402 }, { "entropy": 1.4170372486114502, "epoch": 0.9198288159771755, "grad_norm": 2.96875, "learning_rate": 3.05105468778564e-06, "loss": 0.4183, "mean_token_accuracy": 0.8878279328346252, "num_tokens": 2369558.0, "step": 403 }, { "entropy": 1.2785319834947586, "epoch": 0.9221112696148359, "grad_norm": 3.0, "learning_rate": 3.041865176329579e-06, "loss": 0.383, "mean_token_accuracy": 0.8874974772334099, "num_tokens": 2376487.0, "step": 404 }, { "entropy": 1.5108132362365723, "epoch": 0.9243937232524965, "grad_norm": 3.796875, "learning_rate": 3.032667976076923e-06, "loss": 0.5087, "mean_token_accuracy": 0.8496776968240738, "num_tokens": 2382047.0, "step": 405 }, { "entropy": 1.4732455164194107, "epoch": 0.9266761768901569, "grad_norm": 2.84375, "learning_rate": 3.0234632175313537e-06, "loss": 0.3808, "mean_token_accuracy": 0.8731858357787132, "num_tokens": 2388697.0, "step": 406 }, { "entropy": 1.428204596042633, "epoch": 0.9289586305278174, "grad_norm": 2.96875, "learning_rate": 3.0142510313038057e-06, "loss": 0.3893, "mean_token_accuracy": 0.8852085620164871, "num_tokens": 2395175.0, "step": 407 }, { "entropy": 1.3948392271995544, "epoch": 0.9312410841654779, "grad_norm": 3.015625, "learning_rate": 3.0050315481106074e-06, "loss": 0.4367, "mean_token_accuracy": 0.8680780380964279, "num_tokens": 2401107.0, "step": 408 }, { "entropy": 1.4686945080757141, "epoch": 0.9335235378031383, "grad_norm": 3.078125, "learning_rate": 2.9958048987716266e-06, "loss": 0.4492, "mean_token_accuracy": 0.8716259375214577, "num_tokens": 2407315.0, "step": 409 }, { "entropy": 1.5125146508216858, "epoch": 0.9358059914407989, "grad_norm": 3.5625, "learning_rate": 2.9865712142084145e-06, "loss": 0.5313, "mean_token_accuracy": 0.8568686470389366, "num_tokens": 2413259.0, "step": 410 }, { "entropy": 1.433497592806816, "epoch": 0.9380884450784593, "grad_norm": 3.0, "learning_rate": 2.977330625442352e-06, "loss": 0.412, "mean_token_accuracy": 0.8721762746572495, "num_tokens": 2419468.0, "step": 411 }, { "entropy": 1.4551435112953186, "epoch": 0.9403708987161198, "grad_norm": 2.90625, "learning_rate": 2.9680832635927824e-06, "loss": 0.472, "mean_token_accuracy": 0.8528627678751945, "num_tokens": 2426271.0, "step": 412 }, { "entropy": 1.447442203760147, "epoch": 0.9426533523537803, "grad_norm": 2.71875, "learning_rate": 2.95882925987516e-06, "loss": 0.3598, "mean_token_accuracy": 0.8820754066109657, "num_tokens": 2432887.0, "step": 413 }, { "entropy": 1.5209446549415588, "epoch": 0.9449358059914408, "grad_norm": 3.46875, "learning_rate": 2.949568745599182e-06, "loss": 0.4893, "mean_token_accuracy": 0.8616260290145874, "num_tokens": 2438656.0, "step": 414 }, { "entropy": 1.4069498479366302, "epoch": 0.9472182596291013, "grad_norm": 3.46875, "learning_rate": 2.9403018521669256e-06, "loss": 0.5104, "mean_token_accuracy": 0.8574993088841438, "num_tokens": 2444704.0, "step": 415 }, { "entropy": 1.487932413816452, "epoch": 0.9495007132667618, "grad_norm": 3.21875, "learning_rate": 2.9310287110709895e-06, "loss": 0.4016, "mean_token_accuracy": 0.8731286600232124, "num_tokens": 2450361.0, "step": 416 }, { "entropy": 1.5046747326850891, "epoch": 0.9517831669044222, "grad_norm": 3.34375, "learning_rate": 2.921749453892618e-06, "loss": 0.4286, "mean_token_accuracy": 0.8756372630596161, "num_tokens": 2456532.0, "step": 417 }, { "entropy": 1.5569333881139755, "epoch": 0.9540656205420828, "grad_norm": 3.4375, "learning_rate": 2.9124642122998453e-06, "loss": 0.5047, "mean_token_accuracy": 0.8422510251402855, "num_tokens": 2462276.0, "step": 418 }, { "entropy": 1.477220967411995, "epoch": 0.9563480741797432, "grad_norm": 3.015625, "learning_rate": 2.903173118045616e-06, "loss": 0.4585, "mean_token_accuracy": 0.8631913363933563, "num_tokens": 2468621.0, "step": 419 }, { "entropy": 1.3926943019032478, "epoch": 0.9586305278174037, "grad_norm": 3.53125, "learning_rate": 2.893876302965925e-06, "loss": 0.4379, "mean_token_accuracy": 0.8661207035183907, "num_tokens": 2474234.0, "step": 420 }, { "entropy": 1.5482182949781418, "epoch": 0.9609129814550642, "grad_norm": 3.78125, "learning_rate": 2.884573898977941e-06, "loss": 0.507, "mean_token_accuracy": 0.8496933579444885, "num_tokens": 2479680.0, "step": 421 }, { "entropy": 1.360275536775589, "epoch": 0.9631954350927246, "grad_norm": 3.3125, "learning_rate": 2.8752660380781367e-06, "loss": 0.4307, "mean_token_accuracy": 0.8788939565420151, "num_tokens": 2485907.0, "step": 422 }, { "entropy": 1.3031716644763947, "epoch": 0.9654778887303852, "grad_norm": 2.875, "learning_rate": 2.865952852340417e-06, "loss": 0.3625, "mean_token_accuracy": 0.8956428542733192, "num_tokens": 2492467.0, "step": 423 }, { "entropy": 1.541382610797882, "epoch": 0.9677603423680456, "grad_norm": 3.8125, "learning_rate": 2.856634473914242e-06, "loss": 0.5266, "mean_token_accuracy": 0.8559072092175484, "num_tokens": 2498045.0, "step": 424 }, { "entropy": 1.4921831041574478, "epoch": 0.9700427960057061, "grad_norm": 3.28125, "learning_rate": 2.8473110350227536e-06, "loss": 0.3466, "mean_token_accuracy": 0.8902567103505135, "num_tokens": 2503553.0, "step": 425 }, { "entropy": 1.470309928059578, "epoch": 0.9723252496433666, "grad_norm": 3.375, "learning_rate": 2.8379826679609e-06, "loss": 0.4556, "mean_token_accuracy": 0.8601387813687325, "num_tokens": 2509707.0, "step": 426 }, { "entropy": 1.3546678721904755, "epoch": 0.9746077032810271, "grad_norm": 2.828125, "learning_rate": 2.828649505093558e-06, "loss": 0.3985, "mean_token_accuracy": 0.8941172435879707, "num_tokens": 2516288.0, "step": 427 }, { "entropy": 1.4447802305221558, "epoch": 0.9768901569186876, "grad_norm": 3.421875, "learning_rate": 2.819311678853652e-06, "loss": 0.4776, "mean_token_accuracy": 0.8569598346948624, "num_tokens": 2521956.0, "step": 428 }, { "entropy": 1.6203635483980179, "epoch": 0.9791726105563481, "grad_norm": 3.734375, "learning_rate": 2.8099693217402807e-06, "loss": 0.4593, "mean_token_accuracy": 0.8529090061783791, "num_tokens": 2526920.0, "step": 429 }, { "entropy": 1.473097711801529, "epoch": 0.9814550641940085, "grad_norm": 3.265625, "learning_rate": 2.800622566316831e-06, "loss": 0.5033, "mean_token_accuracy": 0.8560734689235687, "num_tokens": 2533504.0, "step": 430 }, { "entropy": 1.5207239985466003, "epoch": 0.9837375178316691, "grad_norm": 4.09375, "learning_rate": 2.7912715452091014e-06, "loss": 0.5041, "mean_token_accuracy": 0.8554971441626549, "num_tokens": 2538535.0, "step": 431 }, { "entropy": 1.5741059184074402, "epoch": 0.9860199714693295, "grad_norm": 4.0, "learning_rate": 2.7819163911034175e-06, "loss": 0.4511, "mean_token_accuracy": 0.8700136467814445, "num_tokens": 2543371.0, "step": 432 }, { "entropy": 1.3865297734737396, "epoch": 0.9883024251069901, "grad_norm": 3.4375, "learning_rate": 2.77255723674475e-06, "loss": 0.4648, "mean_token_accuracy": 0.8642655313014984, "num_tokens": 2549303.0, "step": 433 }, { "entropy": 1.484322428703308, "epoch": 0.9905848787446505, "grad_norm": 3.453125, "learning_rate": 2.7631942149348313e-06, "loss": 0.5178, "mean_token_accuracy": 0.8604016155004501, "num_tokens": 2554892.0, "step": 434 }, { "entropy": 1.4711394906044006, "epoch": 0.992867332382311, "grad_norm": 3.1875, "learning_rate": 2.7538274585302707e-06, "loss": 0.5105, "mean_token_accuracy": 0.8574899211525917, "num_tokens": 2561168.0, "step": 435 }, { "entropy": 1.4003391563892365, "epoch": 0.9951497860199715, "grad_norm": 2.890625, "learning_rate": 2.74445710044067e-06, "loss": 0.3995, "mean_token_accuracy": 0.8786035105586052, "num_tokens": 2567401.0, "step": 436 }, { "entropy": 1.4778650850057602, "epoch": 0.997432239657632, "grad_norm": 3.25, "learning_rate": 2.735083273626738e-06, "loss": 0.5094, "mean_token_accuracy": 0.8610806316137314, "num_tokens": 2573896.0, "step": 437 }, { "entropy": 1.5298404842615128, "epoch": 0.9997146932952924, "grad_norm": 3.765625, "learning_rate": 2.7257061110984005e-06, "loss": 0.5801, "mean_token_accuracy": 0.8354984298348427, "num_tokens": 2579575.0, "step": 438 }, { "entropy": 1.2647957801818848, "epoch": 1.0, "grad_norm": 7.5, "learning_rate": 2.7163257459129184e-06, "loss": 0.3378, "mean_token_accuracy": 0.9111570119857788, "num_tokens": 2580462.0, "step": 439 }, { "entropy": 1.5493428707122803, "epoch": 1.0022824536376604, "grad_norm": 3.53125, "learning_rate": 2.7069423111729948e-06, "loss": 0.482, "mean_token_accuracy": 0.8536929711699486, "num_tokens": 2586104.0, "step": 440 }, { "entropy": 1.6429398506879807, "epoch": 1.0045649072753209, "grad_norm": 3.765625, "learning_rate": 2.6975559400248876e-06, "loss": 0.5162, "mean_token_accuracy": 0.8646445199847221, "num_tokens": 2591601.0, "step": 441 }, { "entropy": 1.3536241203546524, "epoch": 1.0068473609129815, "grad_norm": 2.53125, "learning_rate": 2.688166765656523e-06, "loss": 0.3578, "mean_token_accuracy": 0.8843531683087349, "num_tokens": 2598127.0, "step": 442 }, { "entropy": 1.4669701904058456, "epoch": 1.009129814550642, "grad_norm": 3.921875, "learning_rate": 2.6787749212956023e-06, "loss": 0.5313, "mean_token_accuracy": 0.8472650721669197, "num_tokens": 2603447.0, "step": 443 }, { "entropy": 1.4554204195737839, "epoch": 1.0114122681883024, "grad_norm": 3.78125, "learning_rate": 2.6693805402077123e-06, "loss": 0.5817, "mean_token_accuracy": 0.83076561242342, "num_tokens": 2609040.0, "step": 444 }, { "entropy": 1.4986287206411362, "epoch": 1.0136947218259629, "grad_norm": 3.546875, "learning_rate": 2.6599837556944353e-06, "loss": 0.498, "mean_token_accuracy": 0.8590250089764595, "num_tokens": 2615545.0, "step": 445 }, { "entropy": 1.5251432359218597, "epoch": 1.0159771754636233, "grad_norm": 4.0, "learning_rate": 2.6505847010914575e-06, "loss": 0.633, "mean_token_accuracy": 0.8183507323265076, "num_tokens": 2621930.0, "step": 446 }, { "entropy": 1.4970913529396057, "epoch": 1.018259629101284, "grad_norm": 3.203125, "learning_rate": 2.641183509766675e-06, "loss": 0.3988, "mean_token_accuracy": 0.8723035603761673, "num_tokens": 2627761.0, "step": 447 }, { "entropy": 1.4567296206951141, "epoch": 1.0205420827389444, "grad_norm": 3.296875, "learning_rate": 2.6317803151183053e-06, "loss": 0.4201, "mean_token_accuracy": 0.8818748518824577, "num_tokens": 2633748.0, "step": 448 }, { "entropy": 1.4635232239961624, "epoch": 1.0228245363766049, "grad_norm": 3.109375, "learning_rate": 2.6223752505729884e-06, "loss": 0.452, "mean_token_accuracy": 0.8645489439368248, "num_tokens": 2639662.0, "step": 449 }, { "entropy": 1.4294497519731522, "epoch": 1.0251069900142653, "grad_norm": 3.28125, "learning_rate": 2.6129684495839013e-06, "loss": 0.5102, "mean_token_accuracy": 0.8570954278111458, "num_tokens": 2645946.0, "step": 450 }, { "entropy": 1.3900626301765442, "epoch": 1.0273894436519257, "grad_norm": 2.9375, "learning_rate": 2.6035600456288573e-06, "loss": 0.3859, "mean_token_accuracy": 0.8834785372018814, "num_tokens": 2652364.0, "step": 451 }, { "entropy": 1.4409504532814026, "epoch": 1.0296718972895864, "grad_norm": 3.1875, "learning_rate": 2.594150172208417e-06, "loss": 0.4641, "mean_token_accuracy": 0.8652448132634163, "num_tokens": 2658338.0, "step": 452 }, { "entropy": 1.5055885165929794, "epoch": 1.0319543509272469, "grad_norm": 3.625, "learning_rate": 2.5847389628439905e-06, "loss": 0.426, "mean_token_accuracy": 0.8645097240805626, "num_tokens": 2663620.0, "step": 453 }, { "entropy": 1.5077017843723297, "epoch": 1.0342368045649073, "grad_norm": 3.15625, "learning_rate": 2.575326551075945e-06, "loss": 0.4288, "mean_token_accuracy": 0.8733096942305565, "num_tokens": 2669362.0, "step": 454 }, { "entropy": 1.3824554234743118, "epoch": 1.0365192582025677, "grad_norm": 3.03125, "learning_rate": 2.5659130704617092e-06, "loss": 0.4209, "mean_token_accuracy": 0.8664216324687004, "num_tokens": 2675587.0, "step": 455 }, { "entropy": 1.4790180027484894, "epoch": 1.0388017118402282, "grad_norm": 3.09375, "learning_rate": 2.5564986545738767e-06, "loss": 0.3928, "mean_token_accuracy": 0.8827410265803337, "num_tokens": 2681742.0, "step": 456 }, { "entropy": 1.4870340526103973, "epoch": 1.0410841654778886, "grad_norm": 3.734375, "learning_rate": 2.547083436998316e-06, "loss": 0.3968, "mean_token_accuracy": 0.8777871504426003, "num_tokens": 2687070.0, "step": 457 }, { "entropy": 1.492873653769493, "epoch": 1.0433666191155493, "grad_norm": 3.375, "learning_rate": 2.5376675513322665e-06, "loss": 0.4273, "mean_token_accuracy": 0.8743336573243141, "num_tokens": 2693415.0, "step": 458 }, { "entropy": 1.5607992857694626, "epoch": 1.0456490727532097, "grad_norm": 4.0, "learning_rate": 2.52825113118245e-06, "loss": 0.5436, "mean_token_accuracy": 0.8444447070360184, "num_tokens": 2699241.0, "step": 459 }, { "entropy": 1.4991340637207031, "epoch": 1.0479315263908702, "grad_norm": 3.0, "learning_rate": 2.5188343101631717e-06, "loss": 0.4713, "mean_token_accuracy": 0.8594570085406303, "num_tokens": 2705629.0, "step": 460 }, { "entropy": 1.4429044276475906, "epoch": 1.0502139800285306, "grad_norm": 3.28125, "learning_rate": 2.5094172218944276e-06, "loss": 0.5136, "mean_token_accuracy": 0.8507946282625198, "num_tokens": 2711944.0, "step": 461 }, { "entropy": 1.5478469878435135, "epoch": 1.052496433666191, "grad_norm": 3.21875, "learning_rate": 2.5e-06, "loss": 0.4498, "mean_token_accuracy": 0.8698392882943153, "num_tokens": 2717870.0, "step": 462 }, { "entropy": 1.4724483042955399, "epoch": 1.0547788873038517, "grad_norm": 4.09375, "learning_rate": 2.4905827781055733e-06, "loss": 0.5091, "mean_token_accuracy": 0.8364823833107948, "num_tokens": 2722955.0, "step": 463 }, { "entropy": 1.4399842321872711, "epoch": 1.0570613409415122, "grad_norm": 2.96875, "learning_rate": 2.4811656898368287e-06, "loss": 0.4118, "mean_token_accuracy": 0.8793508112430573, "num_tokens": 2729267.0, "step": 464 }, { "entropy": 1.4447701424360275, "epoch": 1.0593437945791726, "grad_norm": 3.3125, "learning_rate": 2.4717488688175513e-06, "loss": 0.4089, "mean_token_accuracy": 0.8816163316369057, "num_tokens": 2735200.0, "step": 465 }, { "entropy": 1.507298544049263, "epoch": 1.061626248216833, "grad_norm": 3.71875, "learning_rate": 2.4623324486677352e-06, "loss": 0.5426, "mean_token_accuracy": 0.8359150066971779, "num_tokens": 2740627.0, "step": 466 }, { "entropy": 1.4749993681907654, "epoch": 1.0639087018544935, "grad_norm": 3.28125, "learning_rate": 2.4529165630016855e-06, "loss": 0.4186, "mean_token_accuracy": 0.8762158378958702, "num_tokens": 2745817.0, "step": 467 }, { "entropy": 1.5043630599975586, "epoch": 1.0661911554921542, "grad_norm": 3.25, "learning_rate": 2.4435013454261246e-06, "loss": 0.4691, "mean_token_accuracy": 0.8595764860510826, "num_tokens": 2752047.0, "step": 468 }, { "entropy": 1.464219182729721, "epoch": 1.0684736091298146, "grad_norm": 3.609375, "learning_rate": 2.4340869295382924e-06, "loss": 0.4847, "mean_token_accuracy": 0.8647123128175735, "num_tokens": 2758030.0, "step": 469 }, { "entropy": 1.5525110363960266, "epoch": 1.070756062767475, "grad_norm": 3.40625, "learning_rate": 2.4246734489240554e-06, "loss": 0.4389, "mean_token_accuracy": 0.871659129858017, "num_tokens": 2763739.0, "step": 470 }, { "entropy": 1.441315084695816, "epoch": 1.0730385164051355, "grad_norm": 3.09375, "learning_rate": 2.4152610371560095e-06, "loss": 0.4706, "mean_token_accuracy": 0.8659368455410004, "num_tokens": 2770144.0, "step": 471 }, { "entropy": 1.5431715548038483, "epoch": 1.075320970042796, "grad_norm": 3.671875, "learning_rate": 2.4058498277915835e-06, "loss": 0.5396, "mean_token_accuracy": 0.8234963491559029, "num_tokens": 2776060.0, "step": 472 }, { "entropy": 1.3775285333395004, "epoch": 1.0776034236804566, "grad_norm": 3.0625, "learning_rate": 2.3964399543711427e-06, "loss": 0.3289, "mean_token_accuracy": 0.8977130725979805, "num_tokens": 2782100.0, "step": 473 }, { "entropy": 1.424841582775116, "epoch": 1.079885877318117, "grad_norm": 3.125, "learning_rate": 2.3870315504160995e-06, "loss": 0.4425, "mean_token_accuracy": 0.8671782091259956, "num_tokens": 2787965.0, "step": 474 }, { "entropy": 1.4423463493585587, "epoch": 1.0821683309557775, "grad_norm": 2.953125, "learning_rate": 2.377624749427012e-06, "loss": 0.3595, "mean_token_accuracy": 0.8889539316296577, "num_tokens": 2794165.0, "step": 475 }, { "entropy": 1.4992396533489227, "epoch": 1.084450784593438, "grad_norm": 3.875, "learning_rate": 2.3682196848816955e-06, "loss": 0.4793, "mean_token_accuracy": 0.8694660887122154, "num_tokens": 2800010.0, "step": 476 }, { "entropy": 1.4096488505601883, "epoch": 1.0867332382310984, "grad_norm": 3.03125, "learning_rate": 2.358816490233326e-06, "loss": 0.3516, "mean_token_accuracy": 0.8974229022860527, "num_tokens": 2805889.0, "step": 477 }, { "entropy": 1.4805195033550262, "epoch": 1.089015691868759, "grad_norm": 3.34375, "learning_rate": 2.3494152989085433e-06, "loss": 0.5061, "mean_token_accuracy": 0.8679251745343208, "num_tokens": 2811684.0, "step": 478 }, { "entropy": 1.5036189705133438, "epoch": 1.0912981455064195, "grad_norm": 3.546875, "learning_rate": 2.3400162443055655e-06, "loss": 0.5221, "mean_token_accuracy": 0.8420342952013016, "num_tokens": 2817131.0, "step": 479 }, { "entropy": 1.594360738992691, "epoch": 1.09358059914408, "grad_norm": 4.0, "learning_rate": 2.330619459792289e-06, "loss": 0.5052, "mean_token_accuracy": 0.8538608327507973, "num_tokens": 2822205.0, "step": 480 }, { "entropy": 1.3911210894584656, "epoch": 1.0958630527817403, "grad_norm": 2.796875, "learning_rate": 2.321225078704399e-06, "loss": 0.3525, "mean_token_accuracy": 0.8852925226092339, "num_tokens": 2828146.0, "step": 481 }, { "entropy": 1.5996953547000885, "epoch": 1.0981455064194008, "grad_norm": 3.4375, "learning_rate": 2.311833234343478e-06, "loss": 0.4677, "mean_token_accuracy": 0.8572832494974136, "num_tokens": 2833879.0, "step": 482 }, { "entropy": 1.5117892771959305, "epoch": 1.1004279600570612, "grad_norm": 4.09375, "learning_rate": 2.3024440599751132e-06, "loss": 0.4467, "mean_token_accuracy": 0.8582476228475571, "num_tokens": 2839173.0, "step": 483 }, { "entropy": 1.433998242020607, "epoch": 1.102710413694722, "grad_norm": 2.90625, "learning_rate": 2.293057688827007e-06, "loss": 0.3942, "mean_token_accuracy": 0.8847835510969162, "num_tokens": 2845616.0, "step": 484 }, { "entropy": 1.5342581421136856, "epoch": 1.1049928673323823, "grad_norm": 3.078125, "learning_rate": 2.283674254087082e-06, "loss": 0.4659, "mean_token_accuracy": 0.8615615218877792, "num_tokens": 2851949.0, "step": 485 }, { "entropy": 1.5389353781938553, "epoch": 1.1072753209700428, "grad_norm": 3.421875, "learning_rate": 2.274293888901599e-06, "loss": 0.4388, "mean_token_accuracy": 0.871217891573906, "num_tokens": 2857358.0, "step": 486 }, { "entropy": 1.4772920906543732, "epoch": 1.1095577746077032, "grad_norm": 4.03125, "learning_rate": 2.264916726373263e-06, "loss": 0.5044, "mean_token_accuracy": 0.8598240464925766, "num_tokens": 2862299.0, "step": 487 }, { "entropy": 1.4805989265441895, "epoch": 1.1118402282453639, "grad_norm": 2.890625, "learning_rate": 2.2555428995593303e-06, "loss": 0.444, "mean_token_accuracy": 0.8689677938818932, "num_tokens": 2868820.0, "step": 488 }, { "entropy": 1.4840258061885834, "epoch": 1.1141226818830243, "grad_norm": 3.421875, "learning_rate": 2.24617254146973e-06, "loss": 0.4531, "mean_token_accuracy": 0.8679408878087997, "num_tokens": 2874968.0, "step": 489 }, { "entropy": 1.4381522238254547, "epoch": 1.1164051355206848, "grad_norm": 3.125, "learning_rate": 2.23680578506517e-06, "loss": 0.4115, "mean_token_accuracy": 0.8769493475556374, "num_tokens": 2880835.0, "step": 490 }, { "entropy": 1.4330200850963593, "epoch": 1.1186875891583452, "grad_norm": 2.90625, "learning_rate": 2.2274427632552507e-06, "loss": 0.4123, "mean_token_accuracy": 0.8793010637164116, "num_tokens": 2887529.0, "step": 491 }, { "entropy": 1.3696521073579788, "epoch": 1.1209700427960057, "grad_norm": 2.9375, "learning_rate": 2.2180836088965833e-06, "loss": 0.3384, "mean_token_accuracy": 0.8860399350523949, "num_tokens": 2893458.0, "step": 492 }, { "entropy": 1.4893521070480347, "epoch": 1.123252496433666, "grad_norm": 3.0, "learning_rate": 2.208728454790899e-06, "loss": 0.4691, "mean_token_accuracy": 0.8572286292910576, "num_tokens": 2899716.0, "step": 493 }, { "entropy": 1.3807679414749146, "epoch": 1.1255349500713268, "grad_norm": 3.015625, "learning_rate": 2.1993774336831696e-06, "loss": 0.4068, "mean_token_accuracy": 0.8788377121090889, "num_tokens": 2906271.0, "step": 494 }, { "entropy": 1.4945531785488129, "epoch": 1.1278174037089872, "grad_norm": 3.078125, "learning_rate": 2.19003067825972e-06, "loss": 0.4081, "mean_token_accuracy": 0.8731363192200661, "num_tokens": 2912348.0, "step": 495 }, { "entropy": 1.5495448559522629, "epoch": 1.1300998573466476, "grad_norm": 3.921875, "learning_rate": 2.180688321146349e-06, "loss": 0.601, "mean_token_accuracy": 0.8166243210434914, "num_tokens": 2918060.0, "step": 496 }, { "entropy": 1.5690300911664963, "epoch": 1.132382310984308, "grad_norm": 3.5, "learning_rate": 2.1713504949064433e-06, "loss": 0.4601, "mean_token_accuracy": 0.85266974568367, "num_tokens": 2923409.0, "step": 497 }, { "entropy": 1.3820966184139252, "epoch": 1.1346647646219685, "grad_norm": 2.703125, "learning_rate": 2.1620173320391007e-06, "loss": 0.2558, "mean_token_accuracy": 0.9106499254703522, "num_tokens": 2929722.0, "step": 498 }, { "entropy": 1.540186420083046, "epoch": 1.1369472182596292, "grad_norm": 3.21875, "learning_rate": 2.1526889649772477e-06, "loss": 0.4437, "mean_token_accuracy": 0.8645635023713112, "num_tokens": 2935812.0, "step": 499 }, { "entropy": 1.435683935880661, "epoch": 1.1392296718972896, "grad_norm": 3.234375, "learning_rate": 2.143365526085759e-06, "loss": 0.48, "mean_token_accuracy": 0.8664367198944092, "num_tokens": 2942222.0, "step": 500 }, { "epoch": 1.1392296718972896, "eval_entropy": 1.4798295431666904, "eval_loss": 0.4741344451904297, "eval_mean_token_accuracy": 0.8666040844387478, "eval_num_tokens": 2942222.0, "eval_runtime": 4.4417, "eval_samples_per_second": 20.262, "eval_steps_per_second": 20.262, "step": 500 }, { "entropy": 1.4722786843776703, "epoch": 1.14151212553495, "grad_norm": 3.484375, "learning_rate": 2.1340471476595836e-06, "loss": 0.4947, "mean_token_accuracy": 0.8604092225432396, "num_tokens": 2947869.0, "step": 501 }, { "entropy": 1.5302625745534897, "epoch": 1.1437945791726105, "grad_norm": 3.765625, "learning_rate": 2.124733961921864e-06, "loss": 0.5213, "mean_token_accuracy": 0.8443537354469299, "num_tokens": 2953787.0, "step": 502 }, { "entropy": 1.482778623700142, "epoch": 1.146077032810271, "grad_norm": 4.0, "learning_rate": 2.11542610102206e-06, "loss": 0.5494, "mean_token_accuracy": 0.8402880057692528, "num_tokens": 2958803.0, "step": 503 }, { "entropy": 1.4486149698495865, "epoch": 1.1483594864479316, "grad_norm": 3.203125, "learning_rate": 2.1061236970340756e-06, "loss": 0.4747, "mean_token_accuracy": 0.8640668168663979, "num_tokens": 2965403.0, "step": 504 }, { "entropy": 1.4366931170225143, "epoch": 1.150641940085592, "grad_norm": 3.078125, "learning_rate": 2.096826881954385e-06, "loss": 0.4002, "mean_token_accuracy": 0.869108684360981, "num_tokens": 2971085.0, "step": 505 }, { "entropy": 1.4204550981521606, "epoch": 1.1529243937232525, "grad_norm": 2.890625, "learning_rate": 2.0875357877001556e-06, "loss": 0.3827, "mean_token_accuracy": 0.8868636935949326, "num_tokens": 2976577.0, "step": 506 }, { "entropy": 1.484930396080017, "epoch": 1.155206847360913, "grad_norm": 3.578125, "learning_rate": 2.0782505461073822e-06, "loss": 0.4416, "mean_token_accuracy": 0.8644617721438408, "num_tokens": 2981977.0, "step": 507 }, { "entropy": 1.5487978011369705, "epoch": 1.1574893009985734, "grad_norm": 3.359375, "learning_rate": 2.0689712889290114e-06, "loss": 0.4142, "mean_token_accuracy": 0.8582484424114227, "num_tokens": 2987315.0, "step": 508 }, { "entropy": 1.4167566150426865, "epoch": 1.159771754636234, "grad_norm": 3.046875, "learning_rate": 2.059698147833075e-06, "loss": 0.4121, "mean_token_accuracy": 0.8841976970434189, "num_tokens": 2993295.0, "step": 509 }, { "entropy": 1.3966283351182938, "epoch": 1.1620542082738945, "grad_norm": 3.1875, "learning_rate": 2.0504312544008193e-06, "loss": 0.4939, "mean_token_accuracy": 0.8544362857937813, "num_tokens": 2999720.0, "step": 510 }, { "entropy": 1.5113223046064377, "epoch": 1.164336661911555, "grad_norm": 3.578125, "learning_rate": 2.0411707401248406e-06, "loss": 0.4498, "mean_token_accuracy": 0.8582001850008965, "num_tokens": 3004838.0, "step": 511 }, { "entropy": 1.4698415398597717, "epoch": 1.1666191155492154, "grad_norm": 3.453125, "learning_rate": 2.0319167364072184e-06, "loss": 0.4023, "mean_token_accuracy": 0.8724709004163742, "num_tokens": 3010321.0, "step": 512 }, { "entropy": 1.5211911350488663, "epoch": 1.1689015691868758, "grad_norm": 4.71875, "learning_rate": 2.0226693745576494e-06, "loss": 0.5156, "mean_token_accuracy": 0.8473959043622017, "num_tokens": 3015170.0, "step": 513 }, { "entropy": 1.3687680065631866, "epoch": 1.1711840228245363, "grad_norm": 3.15625, "learning_rate": 2.0134287857915864e-06, "loss": 0.4614, "mean_token_accuracy": 0.8563283011317253, "num_tokens": 3021067.0, "step": 514 }, { "entropy": 1.4768206179141998, "epoch": 1.173466476462197, "grad_norm": 4.0625, "learning_rate": 2.004195101228374e-06, "loss": 0.5225, "mean_token_accuracy": 0.8456647023558617, "num_tokens": 3026317.0, "step": 515 }, { "entropy": 1.5133604854345322, "epoch": 1.1757489300998574, "grad_norm": 3.359375, "learning_rate": 1.9949684518893926e-06, "loss": 0.4637, "mean_token_accuracy": 0.8587842807173729, "num_tokens": 3032462.0, "step": 516 }, { "entropy": 1.5985838025808334, "epoch": 1.1780313837375178, "grad_norm": 3.5, "learning_rate": 1.985748968696194e-06, "loss": 0.4668, "mean_token_accuracy": 0.8562392815947533, "num_tokens": 3037823.0, "step": 517 }, { "entropy": 1.2920548766851425, "epoch": 1.1803138373751783, "grad_norm": 2.453125, "learning_rate": 1.9765367824686467e-06, "loss": 0.3451, "mean_token_accuracy": 0.8893763497471809, "num_tokens": 3044938.0, "step": 518 }, { "entropy": 1.5204766243696213, "epoch": 1.182596291012839, "grad_norm": 3.203125, "learning_rate": 1.9673320239230783e-06, "loss": 0.4753, "mean_token_accuracy": 0.8598108664155006, "num_tokens": 3051301.0, "step": 519 }, { "entropy": 1.445823684334755, "epoch": 1.1848787446504994, "grad_norm": 3.875, "learning_rate": 1.9581348236704217e-06, "loss": 0.4797, "mean_token_accuracy": 0.8649851009249687, "num_tokens": 3056991.0, "step": 520 }, { "entropy": 1.4846927672624588, "epoch": 1.1871611982881598, "grad_norm": 3.875, "learning_rate": 1.9489453122143605e-06, "loss": 0.5029, "mean_token_accuracy": 0.8675966411828995, "num_tokens": 3062974.0, "step": 521 }, { "entropy": 1.4466316848993301, "epoch": 1.1894436519258202, "grad_norm": 3.421875, "learning_rate": 1.939763619949481e-06, "loss": 0.4049, "mean_token_accuracy": 0.8771371468901634, "num_tokens": 3068426.0, "step": 522 }, { "entropy": 1.5384458899497986, "epoch": 1.1917261055634807, "grad_norm": 3.484375, "learning_rate": 1.930589877159415e-06, "loss": 0.454, "mean_token_accuracy": 0.864221066236496, "num_tokens": 3074213.0, "step": 523 }, { "entropy": 1.541999727487564, "epoch": 1.1940085592011411, "grad_norm": 2.984375, "learning_rate": 1.9214242140149987e-06, "loss": 0.3965, "mean_token_accuracy": 0.874009445309639, "num_tokens": 3080429.0, "step": 524 }, { "entropy": 1.4880231320858002, "epoch": 1.1962910128388018, "grad_norm": 4.1875, "learning_rate": 1.9122667605724202e-06, "loss": 0.5623, "mean_token_accuracy": 0.8356714621186256, "num_tokens": 3085713.0, "step": 525 }, { "entropy": 1.5347374975681305, "epoch": 1.1985734664764622, "grad_norm": 2.890625, "learning_rate": 1.9031176467713763e-06, "loss": 0.3592, "mean_token_accuracy": 0.8790554702281952, "num_tokens": 3092191.0, "step": 526 }, { "entropy": 1.4829518347978592, "epoch": 1.2008559201141227, "grad_norm": 3.078125, "learning_rate": 1.8939770024332294e-06, "loss": 0.3886, "mean_token_accuracy": 0.882826641201973, "num_tokens": 3098756.0, "step": 527 }, { "entropy": 1.4315824955701828, "epoch": 1.2031383737517831, "grad_norm": 3.203125, "learning_rate": 1.884844957259163e-06, "loss": 0.4995, "mean_token_accuracy": 0.8524395078420639, "num_tokens": 3104965.0, "step": 528 }, { "entropy": 1.4298695474863052, "epoch": 1.2054208273894436, "grad_norm": 2.8125, "learning_rate": 1.875721640828344e-06, "loss": 0.3871, "mean_token_accuracy": 0.8858682960271835, "num_tokens": 3111490.0, "step": 529 }, { "entropy": 1.4648047238588333, "epoch": 1.2077032810271042, "grad_norm": 2.859375, "learning_rate": 1.866607182596081e-06, "loss": 0.3277, "mean_token_accuracy": 0.8968348726630211, "num_tokens": 3117215.0, "step": 530 }, { "entropy": 1.5635619461536407, "epoch": 1.2099857346647647, "grad_norm": 3.46875, "learning_rate": 1.857501711891993e-06, "loss": 0.4185, "mean_token_accuracy": 0.8711593821644783, "num_tokens": 3123093.0, "step": 531 }, { "entropy": 1.4186049550771713, "epoch": 1.212268188302425, "grad_norm": 3.109375, "learning_rate": 1.848405357918166e-06, "loss": 0.4707, "mean_token_accuracy": 0.8640479817986488, "num_tokens": 3129377.0, "step": 532 }, { "entropy": 1.439442053437233, "epoch": 1.2145506419400856, "grad_norm": 3.046875, "learning_rate": 1.8393182497473271e-06, "loss": 0.3726, "mean_token_accuracy": 0.8774393498897552, "num_tokens": 3135006.0, "step": 533 }, { "entropy": 1.5040159970521927, "epoch": 1.216833095577746, "grad_norm": 3.734375, "learning_rate": 1.830240516321008e-06, "loss": 0.5652, "mean_token_accuracy": 0.8349686115980148, "num_tokens": 3140699.0, "step": 534 }, { "entropy": 1.5229474604129791, "epoch": 1.2191155492154067, "grad_norm": 3.109375, "learning_rate": 1.8211722864477197e-06, "loss": 0.4583, "mean_token_accuracy": 0.8692138940095901, "num_tokens": 3147116.0, "step": 535 }, { "entropy": 1.4866357445716858, "epoch": 1.221398002853067, "grad_norm": 3.390625, "learning_rate": 1.8121136888011198e-06, "loss": 0.5026, "mean_token_accuracy": 0.8499261438846588, "num_tokens": 3153155.0, "step": 536 }, { "entropy": 1.4372419267892838, "epoch": 1.2236804564907275, "grad_norm": 3.453125, "learning_rate": 1.8030648519181926e-06, "loss": 0.4709, "mean_token_accuracy": 0.8507603630423546, "num_tokens": 3158699.0, "step": 537 }, { "entropy": 1.4247355163097382, "epoch": 1.225962910128388, "grad_norm": 2.875, "learning_rate": 1.7940259041974189e-06, "loss": 0.4764, "mean_token_accuracy": 0.8748277649283409, "num_tokens": 3165422.0, "step": 538 }, { "entropy": 1.4786742329597473, "epoch": 1.2282453637660484, "grad_norm": 3.5625, "learning_rate": 1.7849969738969592e-06, "loss": 0.4736, "mean_token_accuracy": 0.8629911243915558, "num_tokens": 3171419.0, "step": 539 }, { "entropy": 1.5304382294416428, "epoch": 1.230527817403709, "grad_norm": 3.375, "learning_rate": 1.7759781891328321e-06, "loss": 0.494, "mean_token_accuracy": 0.8473329395055771, "num_tokens": 3177530.0, "step": 540 }, { "entropy": 1.3744118362665176, "epoch": 1.2328102710413695, "grad_norm": 2.96875, "learning_rate": 1.766969677877094e-06, "loss": 0.4123, "mean_token_accuracy": 0.8834565728902817, "num_tokens": 3184220.0, "step": 541 }, { "entropy": 1.3755813837051392, "epoch": 1.23509272467903, "grad_norm": 2.796875, "learning_rate": 1.7579715679560273e-06, "loss": 0.4265, "mean_token_accuracy": 0.8768275752663612, "num_tokens": 3190613.0, "step": 542 }, { "entropy": 1.4333829581737518, "epoch": 1.2373751783166904, "grad_norm": 3.484375, "learning_rate": 1.7489839870483236e-06, "loss": 0.4931, "mean_token_accuracy": 0.8496510609984398, "num_tokens": 3196269.0, "step": 543 }, { "entropy": 1.4510899037122726, "epoch": 1.2396576319543509, "grad_norm": 3.078125, "learning_rate": 1.7400070626832732e-06, "loss": 0.3757, "mean_token_accuracy": 0.8865254819393158, "num_tokens": 3201924.0, "step": 544 }, { "entropy": 1.4932819455862045, "epoch": 1.2419400855920113, "grad_norm": 3.484375, "learning_rate": 1.7310409222389563e-06, "loss": 0.4531, "mean_token_accuracy": 0.850062184035778, "num_tokens": 3207808.0, "step": 545 }, { "entropy": 1.5299255549907684, "epoch": 1.244222539229672, "grad_norm": 3.328125, "learning_rate": 1.7220856929404342e-06, "loss": 0.4531, "mean_token_accuracy": 0.8687416762113571, "num_tokens": 3213083.0, "step": 546 }, { "entropy": 1.5315914154052734, "epoch": 1.2465049928673324, "grad_norm": 3.84375, "learning_rate": 1.713141501857943e-06, "loss": 0.504, "mean_token_accuracy": 0.850853443145752, "num_tokens": 3218803.0, "step": 547 }, { "entropy": 1.5325356125831604, "epoch": 1.2487874465049928, "grad_norm": 3.578125, "learning_rate": 1.7042084759050948e-06, "loss": 0.495, "mean_token_accuracy": 0.8577945232391357, "num_tokens": 3224187.0, "step": 548 }, { "entropy": 1.3780454993247986, "epoch": 1.2510699001426533, "grad_norm": 3.140625, "learning_rate": 1.6952867418370707e-06, "loss": 0.4453, "mean_token_accuracy": 0.8700388446450233, "num_tokens": 3230589.0, "step": 549 }, { "entropy": 1.466676115989685, "epoch": 1.253352353780314, "grad_norm": 3.296875, "learning_rate": 1.6863764262488292e-06, "loss": 0.496, "mean_token_accuracy": 0.8478997200727463, "num_tokens": 3237256.0, "step": 550 }, { "entropy": 1.4295217841863632, "epoch": 1.2556348074179744, "grad_norm": 3.109375, "learning_rate": 1.677477655573303e-06, "loss": 0.4455, "mean_token_accuracy": 0.8676532134413719, "num_tokens": 3243578.0, "step": 551 }, { "entropy": 1.4432758837938309, "epoch": 1.2579172610556348, "grad_norm": 3.46875, "learning_rate": 1.6685905560796101e-06, "loss": 0.4933, "mean_token_accuracy": 0.8503763899207115, "num_tokens": 3249344.0, "step": 552 }, { "entropy": 1.4768379628658295, "epoch": 1.2601997146932953, "grad_norm": 3.578125, "learning_rate": 1.6597152538712608e-06, "loss": 0.5331, "mean_token_accuracy": 0.8477922007441521, "num_tokens": 3256038.0, "step": 553 }, { "entropy": 1.4157912582159042, "epoch": 1.2624821683309557, "grad_norm": 3.515625, "learning_rate": 1.6508518748843651e-06, "loss": 0.5013, "mean_token_accuracy": 0.860062412917614, "num_tokens": 3261703.0, "step": 554 }, { "entropy": 1.403880551457405, "epoch": 1.2647646219686162, "grad_norm": 3.234375, "learning_rate": 1.6420005448858522e-06, "loss": 0.5094, "mean_token_accuracy": 0.8528245538473129, "num_tokens": 3268063.0, "step": 555 }, { "entropy": 1.5064998269081116, "epoch": 1.2670470756062768, "grad_norm": 3.0625, "learning_rate": 1.6331613894716787e-06, "loss": 0.4452, "mean_token_accuracy": 0.8757540956139565, "num_tokens": 3274092.0, "step": 556 }, { "entropy": 1.4100589752197266, "epoch": 1.2693295292439373, "grad_norm": 3.203125, "learning_rate": 1.6243345340650523e-06, "loss": 0.4675, "mean_token_accuracy": 0.8688594177365303, "num_tokens": 3280661.0, "step": 557 }, { "entropy": 1.5577640682458878, "epoch": 1.2716119828815977, "grad_norm": 3.40625, "learning_rate": 1.6155201039146478e-06, "loss": 0.4195, "mean_token_accuracy": 0.8589218854904175, "num_tokens": 3286601.0, "step": 558 }, { "entropy": 1.3485192209482193, "epoch": 1.2738944365192582, "grad_norm": 2.890625, "learning_rate": 1.6067182240928332e-06, "loss": 0.3449, "mean_token_accuracy": 0.8934107944369316, "num_tokens": 3292073.0, "step": 559 }, { "entropy": 1.5532638430595398, "epoch": 1.2761768901569188, "grad_norm": 3.328125, "learning_rate": 1.5979290194938938e-06, "loss": 0.4331, "mean_token_accuracy": 0.8702542334794998, "num_tokens": 3298200.0, "step": 560 }, { "entropy": 1.5261798650026321, "epoch": 1.2784593437945793, "grad_norm": 3.109375, "learning_rate": 1.5891526148322594e-06, "loss": 0.4389, "mean_token_accuracy": 0.862305723130703, "num_tokens": 3304356.0, "step": 561 }, { "entropy": 1.536175400018692, "epoch": 1.2807417974322397, "grad_norm": 4.3125, "learning_rate": 1.5803891346407342e-06, "loss": 0.5677, "mean_token_accuracy": 0.8316505700349808, "num_tokens": 3309722.0, "step": 562 }, { "entropy": 1.4453733563423157, "epoch": 1.2830242510699001, "grad_norm": 3.1875, "learning_rate": 1.5716387032687314e-06, "loss": 0.3941, "mean_token_accuracy": 0.8798687309026718, "num_tokens": 3315076.0, "step": 563 }, { "entropy": 1.5081749856472015, "epoch": 1.2853067047075606, "grad_norm": 2.96875, "learning_rate": 1.562901444880508e-06, "loss": 0.4143, "mean_token_accuracy": 0.8727659210562706, "num_tokens": 3320848.0, "step": 564 }, { "entropy": 1.5081788897514343, "epoch": 1.287589158345221, "grad_norm": 3.1875, "learning_rate": 1.5541774834534024e-06, "loss": 0.4623, "mean_token_accuracy": 0.8562600538134575, "num_tokens": 3327236.0, "step": 565 }, { "entropy": 1.4714922159910202, "epoch": 1.2898716119828815, "grad_norm": 3.46875, "learning_rate": 1.5454669427760774e-06, "loss": 0.4112, "mean_token_accuracy": 0.8714669123291969, "num_tokens": 3333039.0, "step": 566 }, { "entropy": 1.496582642197609, "epoch": 1.2921540656205421, "grad_norm": 3.328125, "learning_rate": 1.5367699464467596e-06, "loss": 0.4667, "mean_token_accuracy": 0.8694412559270859, "num_tokens": 3339578.0, "step": 567 }, { "entropy": 1.453754335641861, "epoch": 1.2944365192582026, "grad_norm": 3.234375, "learning_rate": 1.5280866178714898e-06, "loss": 0.4655, "mean_token_accuracy": 0.8703877553343773, "num_tokens": 3346073.0, "step": 568 }, { "entropy": 1.496316447854042, "epoch": 1.296718972895863, "grad_norm": 3.3125, "learning_rate": 1.5194170802623692e-06, "loss": 0.403, "mean_token_accuracy": 0.8825008124113083, "num_tokens": 3351735.0, "step": 569 }, { "entropy": 1.5532702058553696, "epoch": 1.2990014265335235, "grad_norm": 3.375, "learning_rate": 1.5107614566358136e-06, "loss": 0.5159, "mean_token_accuracy": 0.872811533510685, "num_tokens": 3358008.0, "step": 570 }, { "entropy": 1.3984228074550629, "epoch": 1.3012838801711841, "grad_norm": 2.90625, "learning_rate": 1.5021198698108038e-06, "loss": 0.4531, "mean_token_accuracy": 0.8692669570446014, "num_tokens": 3364752.0, "step": 571 }, { "entropy": 1.500732660293579, "epoch": 1.3035663338088446, "grad_norm": 3.28125, "learning_rate": 1.4934924424071479e-06, "loss": 0.3973, "mean_token_accuracy": 0.8750224709510803, "num_tokens": 3369908.0, "step": 572 }, { "entropy": 1.4046034514904022, "epoch": 1.305848787446505, "grad_norm": 2.984375, "learning_rate": 1.4848792968437376e-06, "loss": 0.407, "mean_token_accuracy": 0.8775566592812538, "num_tokens": 3376101.0, "step": 573 }, { "entropy": 1.4599164128303528, "epoch": 1.3081312410841655, "grad_norm": 3.15625, "learning_rate": 1.4762805553368115e-06, "loss": 0.4068, "mean_token_accuracy": 0.8896359950304031, "num_tokens": 3381766.0, "step": 574 }, { "entropy": 1.5650553405284882, "epoch": 1.310413694721826, "grad_norm": 3.90625, "learning_rate": 1.4676963398982248e-06, "loss": 0.526, "mean_token_accuracy": 0.8529334291815758, "num_tokens": 3387045.0, "step": 575 }, { "entropy": 1.4292816668748856, "epoch": 1.3126961483594863, "grad_norm": 3.1875, "learning_rate": 1.4591267723337122e-06, "loss": 0.4427, "mean_token_accuracy": 0.8748316466808319, "num_tokens": 3393002.0, "step": 576 }, { "entropy": 1.5142599791288376, "epoch": 1.314978601997147, "grad_norm": 3.03125, "learning_rate": 1.4505719742411644e-06, "loss": 0.3505, "mean_token_accuracy": 0.8907722160220146, "num_tokens": 3398389.0, "step": 577 }, { "entropy": 1.3882330507040024, "epoch": 1.3172610556348074, "grad_norm": 2.734375, "learning_rate": 1.4420320670088977e-06, "loss": 0.3516, "mean_token_accuracy": 0.891185887157917, "num_tokens": 3404815.0, "step": 578 }, { "entropy": 1.5874699354171753, "epoch": 1.3195435092724679, "grad_norm": 3.6875, "learning_rate": 1.4335071718139379e-06, "loss": 0.5036, "mean_token_accuracy": 0.8607900366187096, "num_tokens": 3410299.0, "step": 579 }, { "entropy": 1.5547137558460236, "epoch": 1.3218259629101283, "grad_norm": 3.578125, "learning_rate": 1.424997409620295e-06, "loss": 0.4533, "mean_token_accuracy": 0.8668412491679192, "num_tokens": 3415403.0, "step": 580 }, { "entropy": 1.3236225843429565, "epoch": 1.324108416547789, "grad_norm": 3.078125, "learning_rate": 1.4165029011772513e-06, "loss": 0.4062, "mean_token_accuracy": 0.8871461227536201, "num_tokens": 3421683.0, "step": 581 }, { "entropy": 1.469793826341629, "epoch": 1.3263908701854494, "grad_norm": 2.875, "learning_rate": 1.4080237670176456e-06, "loss": 0.4243, "mean_token_accuracy": 0.8801388815045357, "num_tokens": 3427994.0, "step": 582 }, { "entropy": 1.4647793471813202, "epoch": 1.3286733238231099, "grad_norm": 3.09375, "learning_rate": 1.3995601274561605e-06, "loss": 0.4262, "mean_token_accuracy": 0.8648821488022804, "num_tokens": 3434912.0, "step": 583 }, { "entropy": 1.4584257155656815, "epoch": 1.3309557774607703, "grad_norm": 3.171875, "learning_rate": 1.3911121025876212e-06, "loss": 0.4423, "mean_token_accuracy": 0.8798868283629417, "num_tokens": 3442058.0, "step": 584 }, { "entropy": 1.5300581902265549, "epoch": 1.3332382310984308, "grad_norm": 3.21875, "learning_rate": 1.382679812285287e-06, "loss": 0.4313, "mean_token_accuracy": 0.8496553376317024, "num_tokens": 3447771.0, "step": 585 }, { "entropy": 1.5843760669231415, "epoch": 1.3355206847360912, "grad_norm": 4.0625, "learning_rate": 1.3742633761991519e-06, "loss": 0.4945, "mean_token_accuracy": 0.8482984900474548, "num_tokens": 3452785.0, "step": 586 }, { "entropy": 1.4104232043027878, "epoch": 1.3378031383737519, "grad_norm": 2.984375, "learning_rate": 1.365862913754247e-06, "loss": 0.3925, "mean_token_accuracy": 0.8749718070030212, "num_tokens": 3458611.0, "step": 587 }, { "entropy": 1.5725494027137756, "epoch": 1.3400855920114123, "grad_norm": 3.5, "learning_rate": 1.357478544148943e-06, "loss": 0.4045, "mean_token_accuracy": 0.8671303018927574, "num_tokens": 3465091.0, "step": 588 }, { "entropy": 1.4776265919208527, "epoch": 1.3423680456490727, "grad_norm": 3.015625, "learning_rate": 1.3491103863532626e-06, "loss": 0.3392, "mean_token_accuracy": 0.9015164896845818, "num_tokens": 3470488.0, "step": 589 }, { "entropy": 1.6683387607336044, "epoch": 1.3446504992867332, "grad_norm": 4.8125, "learning_rate": 1.3407585591071944e-06, "loss": 0.5101, "mean_token_accuracy": 0.846831701695919, "num_tokens": 3475407.0, "step": 590 }, { "entropy": 1.5126305967569351, "epoch": 1.3469329529243939, "grad_norm": 3.3125, "learning_rate": 1.3324231809189985e-06, "loss": 0.4343, "mean_token_accuracy": 0.8680194914340973, "num_tokens": 3481469.0, "step": 591 }, { "entropy": 1.5650955736637115, "epoch": 1.3492154065620543, "grad_norm": 3.515625, "learning_rate": 1.3241043700635352e-06, "loss": 0.4892, "mean_token_accuracy": 0.86560869961977, "num_tokens": 3487280.0, "step": 592 }, { "entropy": 1.5224829465150833, "epoch": 1.3514978601997147, "grad_norm": 3.5625, "learning_rate": 1.3158022445805816e-06, "loss": 0.437, "mean_token_accuracy": 0.8517628982663155, "num_tokens": 3492699.0, "step": 593 }, { "entropy": 1.5090250372886658, "epoch": 1.3537803138373752, "grad_norm": 3.734375, "learning_rate": 1.3075169222731573e-06, "loss": 0.4919, "mean_token_accuracy": 0.8590176850557327, "num_tokens": 3498075.0, "step": 594 }, { "entropy": 1.3978570252656937, "epoch": 1.3560627674750356, "grad_norm": 3.375, "learning_rate": 1.2992485207058548e-06, "loss": 0.4248, "mean_token_accuracy": 0.8699210062623024, "num_tokens": 3503380.0, "step": 595 }, { "entropy": 1.4768076539039612, "epoch": 1.358345221112696, "grad_norm": 3.5625, "learning_rate": 1.2909971572031663e-06, "loss": 0.4681, "mean_token_accuracy": 0.8609839826822281, "num_tokens": 3509109.0, "step": 596 }, { "entropy": 1.4522972255945206, "epoch": 1.3606276747503565, "grad_norm": 3.296875, "learning_rate": 1.2827629488478254e-06, "loss": 0.5161, "mean_token_accuracy": 0.8707276359200478, "num_tokens": 3515057.0, "step": 597 }, { "entropy": 1.57838936150074, "epoch": 1.3629101283880172, "grad_norm": 3.484375, "learning_rate": 1.2745460124791425e-06, "loss": 0.4295, "mean_token_accuracy": 0.8608080074191093, "num_tokens": 3520795.0, "step": 598 }, { "entropy": 1.4834775626659393, "epoch": 1.3651925820256776, "grad_norm": 3.1875, "learning_rate": 1.266346464691346e-06, "loss": 0.4126, "mean_token_accuracy": 0.8710288777947426, "num_tokens": 3526380.0, "step": 599 }, { "entropy": 1.5034915506839752, "epoch": 1.367475035663338, "grad_norm": 3.484375, "learning_rate": 1.25816442183193e-06, "loss": 0.5211, "mean_token_accuracy": 0.837138943374157, "num_tokens": 3531865.0, "step": 600 }, { "epoch": 1.367475035663338, "eval_entropy": 1.4793427891201443, "eval_loss": 0.473636656999588, "eval_mean_token_accuracy": 0.8656807369656033, "eval_num_tokens": 3531865.0, "eval_runtime": 4.3898, "eval_samples_per_second": 20.502, "eval_steps_per_second": 20.502, "step": 600 } ], "logging_steps": 1, "max_steps": 878, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.360105773011712e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }