{
  "best_global_step": 600,
  "best_metric": 0.473636656999588,
  "best_model_checkpoint": "./liquidaps-clean-large/checkpoint-600",
  "epoch": 1.367475035663338,
  "eval_steps": 100,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.3900936841964722,
      "epoch": 0.002282453637660485,
      "grad_norm": 13.0,
      "learning_rate": 0.0,
      "loss": 0.8776,
      "mean_token_accuracy": 0.7829889133572578,
      "num_tokens": 5919.0,
      "step": 1
    },
    {
      "entropy": 1.4252997040748596,
      "epoch": 0.00456490727532097,
      "grad_norm": 12.875,
      "learning_rate": 1.1363636363636364e-07,
      "loss": 0.6708,
      "mean_token_accuracy": 0.8342809975147247,
      "num_tokens": 11950.0,
      "step": 2
    },
    {
      "entropy": 1.398602306842804,
      "epoch": 0.0068473609129814554,
      "grad_norm": 13.375,
      "learning_rate": 2.2727272727272729e-07,
      "loss": 0.817,
      "mean_token_accuracy": 0.7826605513691902,
      "num_tokens": 17559.0,
      "step": 3
    },
    {
      "entropy": 1.3683724850416183,
      "epoch": 0.00912981455064194,
      "grad_norm": 13.875,
      "learning_rate": 3.409090909090909e-07,
      "loss": 0.8089,
      "mean_token_accuracy": 0.8110606968402863,
      "num_tokens": 23355.0,
      "step": 4
    },
    {
      "entropy": 1.6440566033124924,
      "epoch": 0.011412268188302425,
      "grad_norm": 16.5,
      "learning_rate": 4.5454545454545457e-07,
      "loss": 1.0826,
      "mean_token_accuracy": 0.7466800287365913,
      "num_tokens": 28342.0,
      "step": 5
    },
    {
      "entropy": 1.2425581067800522,
      "epoch": 0.013694721825962911,
      "grad_norm": 14.0625,
      "learning_rate": 5.681818181818182e-07,
      "loss": 0.8384,
      "mean_token_accuracy": 0.8118839636445045,
      "num_tokens": 33937.0,
      "step": 6
    },
    {
      "entropy": 1.494078889489174,
      "epoch": 0.015977175463623396,
      "grad_norm": 14.9375,
      "learning_rate": 6.818181818181818e-07,
      "loss": 0.8747,
      "mean_token_accuracy": 0.800664909183979,
      "num_tokens": 39724.0,
      "step": 7
    },
    {
      "entropy": 1.3064402341842651,
      "epoch": 0.01825962910128388,
      "grad_norm": 12.0,
      "learning_rate": 7.954545454545455e-07,
      "loss": 0.8043,
      "mean_token_accuracy": 0.8063121438026428,
      "num_tokens": 46054.0,
      "step": 8
    },
    {
      "entropy": 1.507575884461403,
      "epoch": 0.020542082738944364,
      "grad_norm": 17.25,
      "learning_rate": 9.090909090909091e-07,
      "loss": 1.0366,
      "mean_token_accuracy": 0.7458265796303749,
      "num_tokens": 50806.0,
      "step": 9
    },
    {
      "entropy": 1.3228261321783066,
      "epoch": 0.02282453637660485,
      "grad_norm": 13.0,
      "learning_rate": 1.0227272727272729e-06,
      "loss": 0.6629,
      "mean_token_accuracy": 0.8548868969082832,
      "num_tokens": 56696.0,
      "step": 10
    },
    {
      "entropy": 1.3493094593286514,
      "epoch": 0.025106990014265335,
      "grad_norm": 10.9375,
      "learning_rate": 1.1363636363636364e-06,
      "loss": 0.7411,
      "mean_token_accuracy": 0.8017316684126854,
      "num_tokens": 63680.0,
      "step": 11
    },
    {
      "entropy": 1.3807552456855774,
      "epoch": 0.027389443651925822,
      "grad_norm": 12.9375,
      "learning_rate": 1.25e-06,
      "loss": 0.8135,
      "mean_token_accuracy": 0.7994487285614014,
      "num_tokens": 69861.0,
      "step": 12
    },
    {
      "entropy": 1.4055243730545044,
      "epoch": 0.029671897289586305,
      "grad_norm": 11.9375,
      "learning_rate": 1.3636363636363636e-06,
      "loss": 0.9012,
      "mean_token_accuracy": 0.7958070710301399,
      "num_tokens": 75989.0,
      "step": 13
    },
    {
      "entropy": 1.431694433093071,
      "epoch": 0.03195435092724679,
      "grad_norm": 13.75,
      "learning_rate": 1.4772727272727275e-06,
      "loss": 0.9413,
      "mean_token_accuracy": 0.7656892687082291,
      "num_tokens": 81844.0,
      "step": 14
    },
    {
      "entropy": 1.5010923892259598,
      "epoch": 0.034236804564907276,
      "grad_norm": 15.3125,
      "learning_rate": 1.590909090909091e-06,
      "loss": 1.0155,
      "mean_token_accuracy": 0.7734663560986519,
      "num_tokens": 86897.0,
      "step": 15
    },
    {
      "entropy": 1.4839733690023422,
      "epoch": 0.03651925820256776,
      "grad_norm": 12.9375,
      "learning_rate": 1.7045454545454546e-06,
      "loss": 0.8776,
      "mean_token_accuracy": 0.7831285521388054,
      "num_tokens": 92714.0,
      "step": 16
    },
    {
      "entropy": 1.3343003541231155,
      "epoch": 0.038801711840228244,
      "grad_norm": 9.375,
      "learning_rate": 1.8181818181818183e-06,
      "loss": 0.7208,
      "mean_token_accuracy": 0.8181507587432861,
      "num_tokens": 100046.0,
      "step": 17
    },
    {
      "entropy": 1.488086387515068,
      "epoch": 0.04108416547788873,
      "grad_norm": 12.125,
      "learning_rate": 1.931818181818182e-06,
      "loss": 0.7636,
      "mean_token_accuracy": 0.7991937696933746,
      "num_tokens": 105549.0,
      "step": 18
    },
    {
      "entropy": 1.3153499066829681,
      "epoch": 0.04336661911554922,
      "grad_norm": 11.375,
      "learning_rate": 2.0454545454545457e-06,
      "loss": 0.7598,
      "mean_token_accuracy": 0.8102546408772469,
      "num_tokens": 111552.0,
      "step": 19
    },
    {
      "entropy": 1.3515659272670746,
      "epoch": 0.0456490727532097,
      "grad_norm": 11.375,
      "learning_rate": 2.1590909090909092e-06,
      "loss": 0.7113,
      "mean_token_accuracy": 0.810497097671032,
      "num_tokens": 117303.0,
      "step": 20
    },
    {
      "entropy": 1.4470301866531372,
      "epoch": 0.047931526390870186,
      "grad_norm": 11.125,
      "learning_rate": 2.2727272727272728e-06,
      "loss": 0.8029,
      "mean_token_accuracy": 0.7923144474625587,
      "num_tokens": 123355.0,
      "step": 21
    },
    {
      "entropy": 1.3571707159280777,
      "epoch": 0.05021398002853067,
      "grad_norm": 9.4375,
      "learning_rate": 2.3863636363636367e-06,
      "loss": 0.6621,
      "mean_token_accuracy": 0.8315573260188103,
      "num_tokens": 129801.0,
      "step": 22
    },
    {
      "entropy": 1.4135605692863464,
      "epoch": 0.05249643366619115,
      "grad_norm": 10.875,
      "learning_rate": 2.5e-06,
      "loss": 0.7478,
      "mean_token_accuracy": 0.8041789308190346,
      "num_tokens": 135168.0,
      "step": 23
    },
    {
      "entropy": 1.4300416111946106,
      "epoch": 0.054778887303851644,
      "grad_norm": 10.0625,
      "learning_rate": 2.6136363636363637e-06,
      "loss": 0.7541,
      "mean_token_accuracy": 0.8075885996222496,
      "num_tokens": 141202.0,
      "step": 24
    },
    {
      "entropy": 1.3513601571321487,
      "epoch": 0.05706134094151213,
      "grad_norm": 9.25,
      "learning_rate": 2.7272727272727272e-06,
      "loss": 0.6913,
      "mean_token_accuracy": 0.8184778317809105,
      "num_tokens": 147326.0,
      "step": 25
    },
    {
      "entropy": 1.3810700178146362,
      "epoch": 0.05934379457917261,
      "grad_norm": 9.75,
      "learning_rate": 2.8409090909090916e-06,
      "loss": 0.6849,
      "mean_token_accuracy": 0.8293009474873543,
      "num_tokens": 153439.0,
      "step": 26
    },
    {
      "entropy": 1.3730244934558868,
      "epoch": 0.061626248216833095,
      "grad_norm": 9.0625,
      "learning_rate": 2.954545454545455e-06,
      "loss": 0.6562,
      "mean_token_accuracy": 0.8283357098698616,
      "num_tokens": 159411.0,
      "step": 27
    },
    {
      "entropy": 1.337988331913948,
      "epoch": 0.06390870185449359,
      "grad_norm": 8.375,
      "learning_rate": 3.0681818181818186e-06,
      "loss": 0.5966,
      "mean_token_accuracy": 0.837442196905613,
      "num_tokens": 165669.0,
      "step": 28
    },
    {
      "entropy": 1.4772655963897705,
      "epoch": 0.06619115549215407,
      "grad_norm": 9.6875,
      "learning_rate": 3.181818181818182e-06,
      "loss": 0.7038,
      "mean_token_accuracy": 0.8186220824718475,
      "num_tokens": 170944.0,
      "step": 29
    },
    {
      "entropy": 1.3892450034618378,
      "epoch": 0.06847360912981455,
      "grad_norm": 7.8125,
      "learning_rate": 3.2954545454545456e-06,
      "loss": 0.658,
      "mean_token_accuracy": 0.8269658461213112,
      "num_tokens": 176755.0,
      "step": 30
    },
    {
      "entropy": 1.490507110953331,
      "epoch": 0.07075606276747504,
      "grad_norm": 8.375,
      "learning_rate": 3.409090909090909e-06,
      "loss": 0.7584,
      "mean_token_accuracy": 0.7987356930971146,
      "num_tokens": 182319.0,
      "step": 31
    },
    {
      "entropy": 1.3267859369516373,
      "epoch": 0.07303851640513552,
      "grad_norm": 7.15625,
      "learning_rate": 3.522727272727273e-06,
      "loss": 0.6272,
      "mean_token_accuracy": 0.8291826993227005,
      "num_tokens": 188236.0,
      "step": 32
    },
    {
      "entropy": 1.4844342470169067,
      "epoch": 0.075320970042796,
      "grad_norm": 7.53125,
      "learning_rate": 3.6363636363636366e-06,
      "loss": 0.724,
      "mean_token_accuracy": 0.806972049176693,
      "num_tokens": 193965.0,
      "step": 33
    },
    {
      "entropy": 1.4742888659238815,
      "epoch": 0.07760342368045649,
      "grad_norm": 7.03125,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.6635,
      "mean_token_accuracy": 0.8269493877887726,
      "num_tokens": 199814.0,
      "step": 34
    },
    {
      "entropy": 1.3930696845054626,
      "epoch": 0.07988587731811697,
      "grad_norm": 6.5625,
      "learning_rate": 3.863636363636364e-06,
      "loss": 0.6553,
      "mean_token_accuracy": 0.8298437520861626,
      "num_tokens": 205725.0,
      "step": 35
    },
    {
      "entropy": 1.4377078860998154,
      "epoch": 0.08216833095577745,
      "grad_norm": 6.875,
      "learning_rate": 3.9772727272727275e-06,
      "loss": 0.6647,
      "mean_token_accuracy": 0.8262319192290306,
      "num_tokens": 211044.0,
      "step": 36
    },
    {
      "entropy": 1.4484449177980423,
      "epoch": 0.08445078459343795,
      "grad_norm": 5.8125,
      "learning_rate": 4.0909090909090915e-06,
      "loss": 0.6505,
      "mean_token_accuracy": 0.8214789107441902,
      "num_tokens": 217143.0,
      "step": 37
    },
    {
      "entropy": 1.3406399488449097,
      "epoch": 0.08673323823109844,
      "grad_norm": 5.5,
      "learning_rate": 4.204545454545455e-06,
      "loss": 0.5331,
      "mean_token_accuracy": 0.8669695928692818,
      "num_tokens": 224084.0,
      "step": 38
    },
    {
      "entropy": 1.465222254395485,
      "epoch": 0.08901569186875892,
      "grad_norm": 6.09375,
      "learning_rate": 4.3181818181818185e-06,
      "loss": 0.5913,
      "mean_token_accuracy": 0.8346145749092102,
      "num_tokens": 229446.0,
      "step": 39
    },
    {
      "entropy": 1.4082716703414917,
      "epoch": 0.0912981455064194,
      "grad_norm": 5.4375,
      "learning_rate": 4.4318181818181824e-06,
      "loss": 0.4967,
      "mean_token_accuracy": 0.8573063313961029,
      "num_tokens": 235250.0,
      "step": 40
    },
    {
      "entropy": 1.4065438956022263,
      "epoch": 0.09358059914407989,
      "grad_norm": 4.53125,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.5228,
      "mean_token_accuracy": 0.8517210483551025,
      "num_tokens": 241666.0,
      "step": 41
    },
    {
      "entropy": 1.4178601205348969,
      "epoch": 0.09586305278174037,
      "grad_norm": 4.875,
      "learning_rate": 4.6590909090909095e-06,
      "loss": 0.5534,
      "mean_token_accuracy": 0.8581771478056908,
      "num_tokens": 247901.0,
      "step": 42
    },
    {
      "entropy": 1.4665435552597046,
      "epoch": 0.09814550641940085,
      "grad_norm": 4.84375,
      "learning_rate": 4.772727272727273e-06,
      "loss": 0.524,
      "mean_token_accuracy": 0.8218341246247292,
      "num_tokens": 253273.0,
      "step": 43
    },
    {
      "entropy": 1.4858266711235046,
      "epoch": 0.10042796005706134,
      "grad_norm": 5.1875,
      "learning_rate": 4.8863636363636365e-06,
      "loss": 0.6752,
      "mean_token_accuracy": 0.826298251748085,
      "num_tokens": 258616.0,
      "step": 44
    },
    {
      "entropy": 1.3626787662506104,
      "epoch": 0.10271041369472182,
      "grad_norm": 4.5,
      "learning_rate": 5e-06,
      "loss": 0.5618,
      "mean_token_accuracy": 0.8469245880842209,
      "num_tokens": 264408.0,
      "step": 45
    },
    {
      "entropy": 1.3205972537398338,
      "epoch": 0.1049928673323823,
      "grad_norm": 3.796875,
      "learning_rate": 4.99998226312344e-06,
      "loss": 0.4616,
      "mean_token_accuracy": 0.8739962726831436,
      "num_tokens": 270566.0,
      "step": 46
    },
    {
      "entropy": 1.3779225647449493,
      "epoch": 0.10727532097004279,
      "grad_norm": 3.484375,
      "learning_rate": 4.999929052745434e-06,
      "loss": 0.4547,
      "mean_token_accuracy": 0.8725937232375145,
      "num_tokens": 276849.0,
      "step": 47
    },
    {
      "entropy": 1.5054886192083359,
      "epoch": 0.10955777460770329,
      "grad_norm": 4.71875,
      "learning_rate": 4.999840369621011e-06,
      "loss": 0.5994,
      "mean_token_accuracy": 0.8370054960250854,
      "num_tokens": 283205.0,
      "step": 48
    },
    {
      "entropy": 1.5157189071178436,
      "epoch": 0.11184022824536377,
      "grad_norm": 4.65625,
      "learning_rate": 4.999716215008542e-06,
      "loss": 0.5843,
      "mean_token_accuracy": 0.8259787857532501,
      "num_tokens": 288059.0,
      "step": 49
    },
    {
      "entropy": 1.38004170358181,
      "epoch": 0.11412268188302425,
      "grad_norm": 3.8125,
      "learning_rate": 4.999556590669718e-06,
      "loss": 0.405,
      "mean_token_accuracy": 0.8887585029006004,
      "num_tokens": 293798.0,
      "step": 50
    },
    {
      "entropy": 1.6085818111896515,
      "epoch": 0.11640513552068474,
      "grad_norm": 6.21875,
      "learning_rate": 4.99936149886953e-06,
      "loss": 0.5947,
      "mean_token_accuracy": 0.8224818632006645,
      "num_tokens": 298157.0,
      "step": 51
    },
    {
      "entropy": 1.4853103458881378,
      "epoch": 0.11868758915834522,
      "grad_norm": 3.453125,
      "learning_rate": 4.999130942376232e-06,
      "loss": 0.4428,
      "mean_token_accuracy": 0.8794936537742615,
      "num_tokens": 304309.0,
      "step": 52
    },
    {
      "entropy": 1.6272333711385727,
      "epoch": 0.1209700427960057,
      "grad_norm": 4.6875,
      "learning_rate": 4.998864924461305e-06,
      "loss": 0.5762,
      "mean_token_accuracy": 0.8293572887778282,
      "num_tokens": 309756.0,
      "step": 53
    },
    {
      "entropy": 1.289240226149559,
      "epoch": 0.12325249643366619,
      "grad_norm": 3.265625,
      "learning_rate": 4.998563448899413e-06,
      "loss": 0.4,
      "mean_token_accuracy": 0.8821459114551544,
      "num_tokens": 316395.0,
      "step": 54
    },
    {
      "entropy": 1.555517390370369,
      "epoch": 0.12553495007132667,
      "grad_norm": 4.5,
      "learning_rate": 4.998226519968341e-06,
      "loss": 0.5261,
      "mean_token_accuracy": 0.8417777121067047,
      "num_tokens": 321365.0,
      "step": 55
    },
    {
      "entropy": 1.4511889964342117,
      "epoch": 0.12781740370898717,
      "grad_norm": 3.828125,
      "learning_rate": 4.997854142448944e-06,
      "loss": 0.5362,
      "mean_token_accuracy": 0.8543838635087013,
      "num_tokens": 327850.0,
      "step": 56
    },
    {
      "entropy": 1.480227530002594,
      "epoch": 0.13009985734664764,
      "grad_norm": 4.90625,
      "learning_rate": 4.9974463216250735e-06,
      "loss": 0.6281,
      "mean_token_accuracy": 0.8336407989263535,
      "num_tokens": 332724.0,
      "step": 57
    },
    {
      "entropy": 1.4882567524909973,
      "epoch": 0.13238231098430814,
      "grad_norm": 4.03125,
      "learning_rate": 4.997003063283503e-06,
      "loss": 0.5103,
      "mean_token_accuracy": 0.854725182056427,
      "num_tokens": 338496.0,
      "step": 58
    },
    {
      "entropy": 1.3099189698696136,
      "epoch": 0.1346647646219686,
      "grad_norm": 3.546875,
      "learning_rate": 4.996524373713848e-06,
      "loss": 0.4035,
      "mean_token_accuracy": 0.8902565762400627,
      "num_tokens": 344181.0,
      "step": 59
    },
    {
      "entropy": 1.554222896695137,
      "epoch": 0.1369472182596291,
      "grad_norm": 4.28125,
      "learning_rate": 4.996010259708475e-06,
      "loss": 0.5154,
      "mean_token_accuracy": 0.8221362680196762,
      "num_tokens": 349987.0,
      "step": 60
    },
    {
      "entropy": 1.3615255653858185,
      "epoch": 0.13922967189728958,
      "grad_norm": 4.125,
      "learning_rate": 4.995460728562403e-06,
      "loss": 0.5219,
      "mean_token_accuracy": 0.8591368719935417,
      "num_tokens": 355808.0,
      "step": 61
    },
    {
      "entropy": 1.5018275529146194,
      "epoch": 0.14151212553495007,
      "grad_norm": 3.8125,
      "learning_rate": 4.994875788073207e-06,
      "loss": 0.4981,
      "mean_token_accuracy": 0.8580456078052521,
      "num_tokens": 361358.0,
      "step": 62
    },
    {
      "entropy": 1.3897339552640915,
      "epoch": 0.14379457917261054,
      "grad_norm": 3.984375,
      "learning_rate": 4.9942554465409e-06,
      "loss": 0.4961,
      "mean_token_accuracy": 0.8571888878941536,
      "num_tokens": 366798.0,
      "step": 63
    },
    {
      "entropy": 1.3545932322740555,
      "epoch": 0.14607703281027104,
      "grad_norm": 3.34375,
      "learning_rate": 4.99359971276782e-06,
      "loss": 0.4023,
      "mean_token_accuracy": 0.8760626539587975,
      "num_tokens": 373039.0,
      "step": 64
    },
    {
      "entropy": 1.4018055945634842,
      "epoch": 0.14835948644793154,
      "grad_norm": 3.484375,
      "learning_rate": 4.992908596058501e-06,
      "loss": 0.4874,
      "mean_token_accuracy": 0.8551009446382523,
      "num_tokens": 379151.0,
      "step": 65
    },
    {
      "entropy": 1.408715844154358,
      "epoch": 0.150641940085592,
      "grad_norm": 3.78125,
      "learning_rate": 4.9921821062195445e-06,
      "loss": 0.5979,
      "mean_token_accuracy": 0.8376783430576324,
      "num_tokens": 385466.0,
      "step": 66
    },
    {
      "entropy": 1.478136882185936,
      "epoch": 0.1529243937232525,
      "grad_norm": 3.34375,
      "learning_rate": 4.9914202535594795e-06,
      "loss": 0.4359,
      "mean_token_accuracy": 0.8765653073787689,
      "num_tokens": 391861.0,
      "step": 67
    },
    {
      "entropy": 1.3361108154058456,
      "epoch": 0.15520684736091298,
      "grad_norm": 3.453125,
      "learning_rate": 4.990623048888615e-06,
      "loss": 0.4471,
      "mean_token_accuracy": 0.8761897683143616,
      "num_tokens": 397602.0,
      "step": 68
    },
    {
      "entropy": 1.5057465434074402,
      "epoch": 0.15748930099857347,
      "grad_norm": 3.984375,
      "learning_rate": 4.989790503518888e-06,
      "loss": 0.5262,
      "mean_token_accuracy": 0.8583421856164932,
      "num_tokens": 403847.0,
      "step": 69
    },
    {
      "entropy": 1.5415615290403366,
      "epoch": 0.15977175463623394,
      "grad_norm": 4.03125,
      "learning_rate": 4.988922629263701e-06,
      "loss": 0.598,
      "mean_token_accuracy": 0.8401808813214302,
      "num_tokens": 409563.0,
      "step": 70
    },
    {
      "entropy": 1.433893471956253,
      "epoch": 0.16205420827389444,
      "grad_norm": 3.875,
      "learning_rate": 4.988019438437759e-06,
      "loss": 0.5086,
      "mean_token_accuracy": 0.8572655767202377,
      "num_tokens": 415590.0,
      "step": 71
    },
    {
      "entropy": 1.5654226392507553,
      "epoch": 0.1643366619115549,
      "grad_norm": 4.65625,
      "learning_rate": 4.987080943856887e-06,
      "loss": 0.6098,
      "mean_token_accuracy": 0.8376531600952148,
      "num_tokens": 421266.0,
      "step": 72
    },
    {
      "entropy": 1.513851910829544,
      "epoch": 0.1666191155492154,
      "grad_norm": 3.78125,
      "learning_rate": 4.9861071588378565e-06,
      "loss": 0.4454,
      "mean_token_accuracy": 0.8665637820959091,
      "num_tokens": 426394.0,
      "step": 73
    },
    {
      "entropy": 1.5542047619819641,
      "epoch": 0.1689015691868759,
      "grad_norm": 4.71875,
      "learning_rate": 4.9850980971981914e-06,
      "loss": 0.6814,
      "mean_token_accuracy": 0.808769017457962,
      "num_tokens": 431932.0,
      "step": 74
    },
    {
      "entropy": 1.4060749858617783,
      "epoch": 0.17118402282453637,
      "grad_norm": 3.53125,
      "learning_rate": 4.984053773255971e-06,
      "loss": 0.4207,
      "mean_token_accuracy": 0.8581205531954765,
      "num_tokens": 437984.0,
      "step": 75
    },
    {
      "entropy": 1.4776476472616196,
      "epoch": 0.17346647646219687,
      "grad_norm": 3.9375,
      "learning_rate": 4.9829742018296335e-06,
      "loss": 0.5346,
      "mean_token_accuracy": 0.8503594622015953,
      "num_tokens": 444584.0,
      "step": 76
    },
    {
      "entropy": 1.3919195085763931,
      "epoch": 0.17574893009985734,
      "grad_norm": 3.21875,
      "learning_rate": 4.981859398237758e-06,
      "loss": 0.4565,
      "mean_token_accuracy": 0.8721787855029106,
      "num_tokens": 450943.0,
      "step": 77
    },
    {
      "entropy": 1.4689613282680511,
      "epoch": 0.17803138373751784,
      "grad_norm": 3.9375,
      "learning_rate": 4.980709378298851e-06,
      "loss": 0.5434,
      "mean_token_accuracy": 0.8531812652945518,
      "num_tokens": 456471.0,
      "step": 78
    },
    {
      "entropy": 1.474008470773697,
      "epoch": 0.1803138373751783,
      "grad_norm": 4.09375,
      "learning_rate": 4.979524158331123e-06,
      "loss": 0.531,
      "mean_token_accuracy": 0.8535453379154205,
      "num_tokens": 462328.0,
      "step": 79
    },
    {
      "entropy": 1.3587582856416702,
      "epoch": 0.1825962910128388,
      "grad_norm": 4.03125,
      "learning_rate": 4.978303755152254e-06,
      "loss": 0.4992,
      "mean_token_accuracy": 0.8549595400691032,
      "num_tokens": 468402.0,
      "step": 80
    },
    {
      "entropy": 1.3619231432676315,
      "epoch": 0.18487874465049928,
      "grad_norm": 3.359375,
      "learning_rate": 4.977048186079155e-06,
      "loss": 0.4981,
      "mean_token_accuracy": 0.8575711026787758,
      "num_tokens": 473714.0,
      "step": 81
    },
    {
      "entropy": 1.4384445995092392,
      "epoch": 0.18716119828815977,
      "grad_norm": 3.328125,
      "learning_rate": 4.975757468927727e-06,
      "loss": 0.4181,
      "mean_token_accuracy": 0.8731885701417923,
      "num_tokens": 479842.0,
      "step": 82
    },
    {
      "entropy": 1.5311954617500305,
      "epoch": 0.18944365192582024,
      "grad_norm": 4.34375,
      "learning_rate": 4.974431622012601e-06,
      "loss": 0.6287,
      "mean_token_accuracy": 0.821938157081604,
      "num_tokens": 485680.0,
      "step": 83
    },
    {
      "entropy": 1.358711913228035,
      "epoch": 0.19172610556348074,
      "grad_norm": 3.65625,
      "learning_rate": 4.973070664146885e-06,
      "loss": 0.4416,
      "mean_token_accuracy": 0.873858779668808,
      "num_tokens": 491390.0,
      "step": 84
    },
    {
      "entropy": 1.4033315032720566,
      "epoch": 0.19400855920114124,
      "grad_norm": 3.890625,
      "learning_rate": 4.971674614641891e-06,
      "loss": 0.4835,
      "mean_token_accuracy": 0.861111544072628,
      "num_tokens": 497469.0,
      "step": 85
    },
    {
      "entropy": 1.373718798160553,
      "epoch": 0.1962910128388017,
      "grad_norm": 3.46875,
      "learning_rate": 4.970243493306865e-06,
      "loss": 0.4599,
      "mean_token_accuracy": 0.8647707998752594,
      "num_tokens": 503754.0,
      "step": 86
    },
    {
      "entropy": 1.4159798175096512,
      "epoch": 0.1985734664764622,
      "grad_norm": 3.71875,
      "learning_rate": 4.968777320448707e-06,
      "loss": 0.41,
      "mean_token_accuracy": 0.8731393367052078,
      "num_tokens": 509255.0,
      "step": 87
    },
    {
      "entropy": 1.397733435034752,
      "epoch": 0.20085592011412268,
      "grad_norm": 4.0,
      "learning_rate": 4.9672761168716766e-06,
      "loss": 0.4607,
      "mean_token_accuracy": 0.8771609216928482,
      "num_tokens": 515162.0,
      "step": 88
    },
    {
      "entropy": 1.3901693522930145,
      "epoch": 0.20313837375178317,
      "grad_norm": 3.703125,
      "learning_rate": 4.9657399038771045e-06,
      "loss": 0.4985,
      "mean_token_accuracy": 0.8564205095171928,
      "num_tokens": 520980.0,
      "step": 89
    },
    {
      "entropy": 1.470759555697441,
      "epoch": 0.20542082738944364,
      "grad_norm": 4.09375,
      "learning_rate": 4.964168703263086e-06,
      "loss": 0.5552,
      "mean_token_accuracy": 0.834749348461628,
      "num_tokens": 526901.0,
      "step": 90
    },
    {
      "entropy": 1.5493524819612503,
      "epoch": 0.20770328102710414,
      "grad_norm": 4.09375,
      "learning_rate": 4.962562537324176e-06,
      "loss": 0.5276,
      "mean_token_accuracy": 0.8242713585495949,
      "num_tokens": 532502.0,
      "step": 91
    },
    {
      "entropy": 1.4955510944128036,
      "epoch": 0.2099857346647646,
      "grad_norm": 4.5,
      "learning_rate": 4.960921428851066e-06,
      "loss": 0.6117,
      "mean_token_accuracy": 0.8246004208922386,
      "num_tokens": 538159.0,
      "step": 92
    },
    {
      "entropy": 1.4567335098981857,
      "epoch": 0.2122681883024251,
      "grad_norm": 3.0,
      "learning_rate": 4.959245401130269e-06,
      "loss": 0.3503,
      "mean_token_accuracy": 0.8856313973665237,
      "num_tokens": 544079.0,
      "step": 93
    },
    {
      "entropy": 1.458535224199295,
      "epoch": 0.21455064194008558,
      "grad_norm": 3.625,
      "learning_rate": 4.957534477943782e-06,
      "loss": 0.4434,
      "mean_token_accuracy": 0.858425110578537,
      "num_tokens": 550037.0,
      "step": 94
    },
    {
      "entropy": 1.3983053117990494,
      "epoch": 0.21683309557774608,
      "grad_norm": 3.375,
      "learning_rate": 4.955788683568749e-06,
      "loss": 0.4004,
      "mean_token_accuracy": 0.8748428821563721,
      "num_tokens": 556585.0,
      "step": 95
    },
    {
      "entropy": 1.481145054101944,
      "epoch": 0.21911554921540657,
      "grad_norm": 3.3125,
      "learning_rate": 4.954008042777125e-06,
      "loss": 0.409,
      "mean_token_accuracy": 0.8758149892091751,
      "num_tokens": 562355.0,
      "step": 96
    },
    {
      "entropy": 1.6243803054094315,
      "epoch": 0.22139800285306704,
      "grad_norm": 4.75,
      "learning_rate": 4.952192580835313e-06,
      "loss": 0.6636,
      "mean_token_accuracy": 0.7973536550998688,
      "num_tokens": 568202.0,
      "step": 97
    },
    {
      "entropy": 1.575976401567459,
      "epoch": 0.22368045649072754,
      "grad_norm": 4.59375,
      "learning_rate": 4.950342323503812e-06,
      "loss": 0.6046,
      "mean_token_accuracy": 0.813086025416851,
      "num_tokens": 573655.0,
      "step": 98
    },
    {
      "entropy": 1.5205018073320389,
      "epoch": 0.225962910128388,
      "grad_norm": 3.953125,
      "learning_rate": 4.9484572970368516e-06,
      "loss": 0.5502,
      "mean_token_accuracy": 0.8478811085224152,
      "num_tokens": 579742.0,
      "step": 99
    },
    {
      "entropy": 1.5319028943777084,
      "epoch": 0.2282453637660485,
      "grad_norm": 4.71875,
      "learning_rate": 4.946537528182017e-06,
      "loss": 0.6014,
      "mean_token_accuracy": 0.8344146087765694,
      "num_tokens": 584824.0,
      "step": 100
    },
    {
      "epoch": 0.2282453637660485,
      "eval_entropy": 1.4501528475019667,
      "eval_loss": 0.5052191615104675,
      "eval_mean_token_accuracy": 0.8605326036612193,
      "eval_num_tokens": 584824.0,
      "eval_runtime": 4.4666,
      "eval_samples_per_second": 20.149,
      "eval_steps_per_second": 20.149,
      "step": 100
    },
    {
      "entropy": 1.3917143046855927,
      "epoch": 0.23052781740370898,
      "grad_norm": 3.203125,
      "learning_rate": 4.944583044179871e-06,
      "loss": 0.3933,
      "mean_token_accuracy": 0.8733155429363251,
      "num_tokens": 590608.0,
      "step": 101
    },
    {
      "entropy": 1.3328562825918198,
      "epoch": 0.23281027104136948,
      "grad_norm": 3.0625,
      "learning_rate": 4.942593872763566e-06,
      "loss": 0.3922,
      "mean_token_accuracy": 0.8770610764622688,
      "num_tokens": 596918.0,
      "step": 102
    },
    {
      "entropy": 1.3897913247346878,
      "epoch": 0.23509272467902995,
      "grad_norm": 3.4375,
      "learning_rate": 4.940570042158454e-06,
      "loss": 0.4864,
      "mean_token_accuracy": 0.8629380613565445,
      "num_tokens": 602674.0,
      "step": 103
    },
    {
      "entropy": 1.5906241983175278,
      "epoch": 0.23737517831669044,
      "grad_norm": 4.46875,
      "learning_rate": 4.93851158108168e-06,
      "loss": 0.6066,
      "mean_token_accuracy": 0.8188068121671677,
      "num_tokens": 608041.0,
      "step": 104
    },
    {
      "entropy": 1.421783059835434,
      "epoch": 0.2396576319543509,
      "grad_norm": 3.453125,
      "learning_rate": 4.93641851874178e-06,
      "loss": 0.4813,
      "mean_token_accuracy": 0.8542051687836647,
      "num_tokens": 613908.0,
      "step": 105
    },
    {
      "entropy": 1.4839935898780823,
      "epoch": 0.2419400855920114,
      "grad_norm": 4.34375,
      "learning_rate": 4.934290884838266e-06,
      "loss": 0.539,
      "mean_token_accuracy": 0.8587613850831985,
      "num_tokens": 620475.0,
      "step": 106
    },
    {
      "entropy": 1.4981091767549515,
      "epoch": 0.2442225392296719,
      "grad_norm": 3.40625,
      "learning_rate": 4.932128709561202e-06,
      "loss": 0.4702,
      "mean_token_accuracy": 0.866189256310463,
      "num_tokens": 626833.0,
      "step": 107
    },
    {
      "entropy": 1.47100168466568,
      "epoch": 0.24650499286733238,
      "grad_norm": 3.453125,
      "learning_rate": 4.929932023590776e-06,
      "loss": 0.4146,
      "mean_token_accuracy": 0.8706357181072235,
      "num_tokens": 632605.0,
      "step": 108
    },
    {
      "entropy": 1.4089600145816803,
      "epoch": 0.24878744650499288,
      "grad_norm": 2.921875,
      "learning_rate": 4.9277008580968665e-06,
      "loss": 0.4052,
      "mean_token_accuracy": 0.8793638423085213,
      "num_tokens": 639026.0,
      "step": 109
    },
    {
      "entropy": 1.4623335748910904,
      "epoch": 0.25106990014265335,
      "grad_norm": 3.109375,
      "learning_rate": 4.925435244738599e-06,
      "loss": 0.4251,
      "mean_token_accuracy": 0.8607661128044128,
      "num_tokens": 645661.0,
      "step": 110
    },
    {
      "entropy": 1.469603717327118,
      "epoch": 0.25335235378031384,
      "grad_norm": 3.203125,
      "learning_rate": 4.923135215663897e-06,
      "loss": 0.4562,
      "mean_token_accuracy": 0.8637586832046509,
      "num_tokens": 652088.0,
      "step": 111
    },
    {
      "entropy": 1.4699177891016006,
      "epoch": 0.25563480741797434,
      "grad_norm": 3.78125,
      "learning_rate": 4.920800803509026e-06,
      "loss": 0.4358,
      "mean_token_accuracy": 0.8661052659153938,
      "num_tokens": 657148.0,
      "step": 112
    },
    {
      "entropy": 1.4687887877225876,
      "epoch": 0.2579172610556348,
      "grad_norm": 4.15625,
      "learning_rate": 4.91843204139813e-06,
      "loss": 0.4832,
      "mean_token_accuracy": 0.87067711353302,
      "num_tokens": 662846.0,
      "step": 113
    },
    {
      "entropy": 1.3910206109285355,
      "epoch": 0.2601997146932953,
      "grad_norm": 3.96875,
      "learning_rate": 4.916028962942763e-06,
      "loss": 0.4606,
      "mean_token_accuracy": 0.8688057661056519,
      "num_tokens": 668283.0,
      "step": 114
    },
    {
      "entropy": 1.4946473091840744,
      "epoch": 0.2624821683309558,
      "grad_norm": 4.4375,
      "learning_rate": 4.913591602241409e-06,
      "loss": 0.5177,
      "mean_token_accuracy": 0.8503523468971252,
      "num_tokens": 673962.0,
      "step": 115
    },
    {
      "entropy": 1.4268899112939835,
      "epoch": 0.2647646219686163,
      "grad_norm": 3.734375,
      "learning_rate": 4.911119993878999e-06,
      "loss": 0.4608,
      "mean_token_accuracy": 0.8624838441610336,
      "num_tokens": 679433.0,
      "step": 116
    },
    {
      "entropy": 1.4775933474302292,
      "epoch": 0.2670470756062768,
      "grad_norm": 3.359375,
      "learning_rate": 4.908614172926426e-06,
      "loss": 0.373,
      "mean_token_accuracy": 0.8674890100955963,
      "num_tokens": 685178.0,
      "step": 117
    },
    {
      "entropy": 1.4562716633081436,
      "epoch": 0.2693295292439372,
      "grad_norm": 3.890625,
      "learning_rate": 4.906074174940038e-06,
      "loss": 0.5465,
      "mean_token_accuracy": 0.8421404510736465,
      "num_tokens": 691044.0,
      "step": 118
    },
    {
      "entropy": 1.404031679034233,
      "epoch": 0.2716119828815977,
      "grad_norm": 3.625,
      "learning_rate": 4.903500035961139e-06,
      "loss": 0.4888,
      "mean_token_accuracy": 0.8540224209427834,
      "num_tokens": 697301.0,
      "step": 119
    },
    {
      "entropy": 1.421856850385666,
      "epoch": 0.2738944365192582,
      "grad_norm": 3.328125,
      "learning_rate": 4.9008917925154795e-06,
      "loss": 0.438,
      "mean_token_accuracy": 0.8775565698742867,
      "num_tokens": 704275.0,
      "step": 120
    },
    {
      "entropy": 1.5078845471143723,
      "epoch": 0.2761768901569187,
      "grad_norm": 3.640625,
      "learning_rate": 4.89824948161273e-06,
      "loss": 0.4837,
      "mean_token_accuracy": 0.8578910827636719,
      "num_tokens": 710429.0,
      "step": 121
    },
    {
      "entropy": 1.4396383464336395,
      "epoch": 0.27845934379457915,
      "grad_norm": 3.71875,
      "learning_rate": 4.895573140745967e-06,
      "loss": 0.5219,
      "mean_token_accuracy": 0.8433092087507248,
      "num_tokens": 715838.0,
      "step": 122
    },
    {
      "entropy": 1.4553385972976685,
      "epoch": 0.28074179743223965,
      "grad_norm": 3.578125,
      "learning_rate": 4.892862807891131e-06,
      "loss": 0.4401,
      "mean_token_accuracy": 0.869629830121994,
      "num_tokens": 721249.0,
      "step": 123
    },
    {
      "entropy": 1.4222912788391113,
      "epoch": 0.28302425106990015,
      "grad_norm": 3.921875,
      "learning_rate": 4.890118521506494e-06,
      "loss": 0.5689,
      "mean_token_accuracy": 0.8471446335315704,
      "num_tokens": 727806.0,
      "step": 124
    },
    {
      "entropy": 1.4638441801071167,
      "epoch": 0.28530670470756064,
      "grad_norm": 3.59375,
      "learning_rate": 4.8873403205321115e-06,
      "loss": 0.4898,
      "mean_token_accuracy": 0.8609614819288254,
      "num_tokens": 733588.0,
      "step": 125
    },
    {
      "entropy": 1.360969141125679,
      "epoch": 0.2875891583452211,
      "grad_norm": 4.9375,
      "learning_rate": 4.884528244389269e-06,
      "loss": 0.5004,
      "mean_token_accuracy": 0.8577578216791153,
      "num_tokens": 739069.0,
      "step": 126
    },
    {
      "entropy": 1.4701900631189346,
      "epoch": 0.2898716119828816,
      "grad_norm": 3.890625,
      "learning_rate": 4.881682332979925e-06,
      "loss": 0.4782,
      "mean_token_accuracy": 0.8597236052155495,
      "num_tokens": 744612.0,
      "step": 127
    },
    {
      "entropy": 1.484321504831314,
      "epoch": 0.2921540656205421,
      "grad_norm": 4.34375,
      "learning_rate": 4.878802626686141e-06,
      "loss": 0.5044,
      "mean_token_accuracy": 0.8599332422018051,
      "num_tokens": 750198.0,
      "step": 128
    },
    {
      "entropy": 1.4526187181472778,
      "epoch": 0.2944365192582026,
      "grad_norm": 4.25,
      "learning_rate": 4.8758891663695165e-06,
      "loss": 0.5283,
      "mean_token_accuracy": 0.8519927933812141,
      "num_tokens": 755825.0,
      "step": 129
    },
    {
      "entropy": 1.487746685743332,
      "epoch": 0.2967189728958631,
      "grad_norm": 3.859375,
      "learning_rate": 4.872941993370598e-06,
      "loss": 0.4834,
      "mean_token_accuracy": 0.865722268819809,
      "num_tokens": 762609.0,
      "step": 130
    },
    {
      "entropy": 1.4334597885608673,
      "epoch": 0.2990014265335235,
      "grad_norm": 3.609375,
      "learning_rate": 4.869961149508301e-06,
      "loss": 0.462,
      "mean_token_accuracy": 0.8797513917088509,
      "num_tokens": 768825.0,
      "step": 131
    },
    {
      "entropy": 1.5593868792057037,
      "epoch": 0.301283880171184,
      "grad_norm": 3.75,
      "learning_rate": 4.866946677079314e-06,
      "loss": 0.4398,
      "mean_token_accuracy": 0.8622937723994255,
      "num_tokens": 774231.0,
      "step": 132
    },
    {
      "entropy": 1.582775130867958,
      "epoch": 0.3035663338088445,
      "grad_norm": 4.1875,
      "learning_rate": 4.8638986188574955e-06,
      "loss": 0.5733,
      "mean_token_accuracy": 0.8216232135891914,
      "num_tokens": 779217.0,
      "step": 133
    },
    {
      "entropy": 1.4957093298435211,
      "epoch": 0.305848787446505,
      "grad_norm": 3.875,
      "learning_rate": 4.8608170180932725e-06,
      "loss": 0.4983,
      "mean_token_accuracy": 0.8560524433851242,
      "num_tokens": 785209.0,
      "step": 134
    },
    {
      "entropy": 1.4334331154823303,
      "epoch": 0.30813124108416545,
      "grad_norm": 3.375,
      "learning_rate": 4.857701918513023e-06,
      "loss": 0.4457,
      "mean_token_accuracy": 0.8704549074172974,
      "num_tokens": 791251.0,
      "step": 135
    },
    {
      "entropy": 1.4960424304008484,
      "epoch": 0.31041369472182595,
      "grad_norm": 3.546875,
      "learning_rate": 4.854553364318456e-06,
      "loss": 0.4823,
      "mean_token_accuracy": 0.869213730096817,
      "num_tokens": 797202.0,
      "step": 136
    },
    {
      "entropy": 1.3933140188455582,
      "epoch": 0.31269614835948645,
      "grad_norm": 3.0,
      "learning_rate": 4.851371400185986e-06,
      "loss": 0.4387,
      "mean_token_accuracy": 0.8605329319834709,
      "num_tokens": 804144.0,
      "step": 137
    },
    {
      "entropy": 1.4915095120668411,
      "epoch": 0.31497860199714695,
      "grad_norm": 4.03125,
      "learning_rate": 4.848156071266095e-06,
      "loss": 0.404,
      "mean_token_accuracy": 0.8624937981367111,
      "num_tokens": 809125.0,
      "step": 138
    },
    {
      "entropy": 1.422121912240982,
      "epoch": 0.31726105563480744,
      "grad_norm": 3.5,
      "learning_rate": 4.844907423182699e-06,
      "loss": 0.3698,
      "mean_token_accuracy": 0.8753552809357643,
      "num_tokens": 814420.0,
      "step": 139
    },
    {
      "entropy": 1.3587403669953346,
      "epoch": 0.3195435092724679,
      "grad_norm": 3.65625,
      "learning_rate": 4.841625502032495e-06,
      "loss": 0.4201,
      "mean_token_accuracy": 0.8749541118741035,
      "num_tokens": 819445.0,
      "step": 140
    },
    {
      "entropy": 1.5249932259321213,
      "epoch": 0.3218259629101284,
      "grad_norm": 3.765625,
      "learning_rate": 4.838310354384304e-06,
      "loss": 0.4569,
      "mean_token_accuracy": 0.8636204749345779,
      "num_tokens": 825423.0,
      "step": 141
    },
    {
      "entropy": 1.4787572473287582,
      "epoch": 0.3241084165477889,
      "grad_norm": 4.28125,
      "learning_rate": 4.834962027278418e-06,
      "loss": 0.4271,
      "mean_token_accuracy": 0.8966826573014259,
      "num_tokens": 830608.0,
      "step": 142
    },
    {
      "entropy": 1.3900313079357147,
      "epoch": 0.3263908701854494,
      "grad_norm": 3.3125,
      "learning_rate": 4.831580568225931e-06,
      "loss": 0.4272,
      "mean_token_accuracy": 0.8754951432347298,
      "num_tokens": 837069.0,
      "step": 143
    },
    {
      "entropy": 1.4659005105495453,
      "epoch": 0.3286733238231098,
      "grad_norm": 3.71875,
      "learning_rate": 4.828166025208059e-06,
      "loss": 0.4788,
      "mean_token_accuracy": 0.8542606756091118,
      "num_tokens": 842779.0,
      "step": 144
    },
    {
      "entropy": 1.4241313189268112,
      "epoch": 0.3309557774607703,
      "grad_norm": 3.5,
      "learning_rate": 4.824718446675465e-06,
      "loss": 0.4501,
      "mean_token_accuracy": 0.8673816919326782,
      "num_tokens": 848075.0,
      "step": 145
    },
    {
      "entropy": 1.3615167737007141,
      "epoch": 0.3332382310984308,
      "grad_norm": 3.984375,
      "learning_rate": 4.821237881547567e-06,
      "loss": 0.4803,
      "mean_token_accuracy": 0.8680660426616669,
      "num_tokens": 853972.0,
      "step": 146
    },
    {
      "entropy": 1.4747860878705978,
      "epoch": 0.3355206847360913,
      "grad_norm": 3.890625,
      "learning_rate": 4.8177243792118515e-06,
      "loss": 0.4336,
      "mean_token_accuracy": 0.8747361823916435,
      "num_tokens": 859859.0,
      "step": 147
    },
    {
      "entropy": 1.5414969474077225,
      "epoch": 0.3378031383737518,
      "grad_norm": 3.484375,
      "learning_rate": 4.814177989523162e-06,
      "loss": 0.4489,
      "mean_token_accuracy": 0.8644633367657661,
      "num_tokens": 865836.0,
      "step": 148
    },
    {
      "entropy": 1.6249495893716812,
      "epoch": 0.34008559201141225,
      "grad_norm": 3.53125,
      "learning_rate": 4.810598762803e-06,
      "loss": 0.5226,
      "mean_token_accuracy": 0.8477596640586853,
      "num_tokens": 872086.0,
      "step": 149
    },
    {
      "entropy": 1.4743667244911194,
      "epoch": 0.34236804564907275,
      "grad_norm": 3.75,
      "learning_rate": 4.8069867498388066e-06,
      "loss": 0.4693,
      "mean_token_accuracy": 0.8513918668031693,
      "num_tokens": 877138.0,
      "step": 150
    },
    {
      "entropy": 1.3822671622037888,
      "epoch": 0.34465049928673325,
      "grad_norm": 3.21875,
      "learning_rate": 4.803342001883247e-06,
      "loss": 0.408,
      "mean_token_accuracy": 0.8763712868094444,
      "num_tokens": 883268.0,
      "step": 151
    },
    {
      "entropy": 1.4955266863107681,
      "epoch": 0.34693295292439374,
      "grad_norm": 4.15625,
      "learning_rate": 4.799664570653473e-06,
      "loss": 0.5271,
      "mean_token_accuracy": 0.8504318669438362,
      "num_tokens": 889206.0,
      "step": 152
    },
    {
      "entropy": 1.6125495880842209,
      "epoch": 0.3492154065620542,
      "grad_norm": 5.71875,
      "learning_rate": 4.795954508330403e-06,
      "loss": 0.6248,
      "mean_token_accuracy": 0.8179907724261284,
      "num_tokens": 894476.0,
      "step": 153
    },
    {
      "entropy": 1.5931424498558044,
      "epoch": 0.3514978601997147,
      "grad_norm": 4.75,
      "learning_rate": 4.792211867557969e-06,
      "loss": 0.4888,
      "mean_token_accuracy": 0.8579384312033653,
      "num_tokens": 899026.0,
      "step": 154
    },
    {
      "entropy": 1.4209279268980026,
      "epoch": 0.3537803138373752,
      "grad_norm": 3.484375,
      "learning_rate": 4.788436701442378e-06,
      "loss": 0.4354,
      "mean_token_accuracy": 0.8708065152168274,
      "num_tokens": 905347.0,
      "step": 155
    },
    {
      "entropy": 1.4381519109010696,
      "epoch": 0.3560627674750357,
      "grad_norm": 3.703125,
      "learning_rate": 4.784629063551354e-06,
      "loss": 0.5609,
      "mean_token_accuracy": 0.8458188697695732,
      "num_tokens": 911400.0,
      "step": 156
    },
    {
      "entropy": 1.4265454858541489,
      "epoch": 0.3583452211126961,
      "grad_norm": 3.5,
      "learning_rate": 4.780789007913379e-06,
      "loss": 0.516,
      "mean_token_accuracy": 0.8464484214782715,
      "num_tokens": 917633.0,
      "step": 157
    },
    {
      "entropy": 1.6952187418937683,
      "epoch": 0.3606276747503566,
      "grad_norm": 4.46875,
      "learning_rate": 4.776916589016928e-06,
      "loss": 0.6655,
      "mean_token_accuracy": 0.8154120817780495,
      "num_tokens": 922878.0,
      "step": 158
    },
    {
      "entropy": 1.4849806427955627,
      "epoch": 0.3629101283880171,
      "grad_norm": 3.984375,
      "learning_rate": 4.773011861809694e-06,
      "loss": 0.5529,
      "mean_token_accuracy": 0.8317237794399261,
      "num_tokens": 928432.0,
      "step": 159
    },
    {
      "entropy": 1.3825362920761108,
      "epoch": 0.3651925820256776,
      "grad_norm": 3.1875,
      "learning_rate": 4.769074881697806e-06,
      "loss": 0.422,
      "mean_token_accuracy": 0.8742568120360374,
      "num_tokens": 934019.0,
      "step": 160
    },
    {
      "entropy": 1.429061233997345,
      "epoch": 0.3674750356633381,
      "grad_norm": 3.328125,
      "learning_rate": 4.765105704545052e-06,
      "loss": 0.4181,
      "mean_token_accuracy": 0.8700381815433502,
      "num_tokens": 940405.0,
      "step": 161
    },
    {
      "entropy": 1.4522172808647156,
      "epoch": 0.36975748930099855,
      "grad_norm": 3.296875,
      "learning_rate": 4.761104386672074e-06,
      "loss": 0.4664,
      "mean_token_accuracy": 0.8705998063087463,
      "num_tokens": 946891.0,
      "step": 162
    },
    {
      "entropy": 1.4823334366083145,
      "epoch": 0.37203994293865905,
      "grad_norm": 3.171875,
      "learning_rate": 4.757070984855577e-06,
      "loss": 0.3902,
      "mean_token_accuracy": 0.8853188008069992,
      "num_tokens": 954063.0,
      "step": 163
    },
    {
      "entropy": 1.4951584190130234,
      "epoch": 0.37432239657631955,
      "grad_norm": 4.25,
      "learning_rate": 4.7530055563275225e-06,
      "loss": 0.4601,
      "mean_token_accuracy": 0.8618411421775818,
      "num_tokens": 959914.0,
      "step": 164
    },
    {
      "entropy": 1.4573408663272858,
      "epoch": 0.37660485021398005,
      "grad_norm": 4.625,
      "learning_rate": 4.748908158774312e-06,
      "loss": 0.5381,
      "mean_token_accuracy": 0.8516411259770393,
      "num_tokens": 965145.0,
      "step": 165
    },
    {
      "entropy": 1.4346065074205399,
      "epoch": 0.3788873038516405,
      "grad_norm": 3.421875,
      "learning_rate": 4.744778850335974e-06,
      "loss": 0.4718,
      "mean_token_accuracy": 0.8635387867689133,
      "num_tokens": 971469.0,
      "step": 166
    },
    {
      "entropy": 1.4204413443803787,
      "epoch": 0.381169757489301,
      "grad_norm": 3.6875,
      "learning_rate": 4.7406176896053356e-06,
      "loss": 0.4281,
      "mean_token_accuracy": 0.8760756626725197,
      "num_tokens": 976905.0,
      "step": 167
    },
    {
      "entropy": 1.4582399874925613,
      "epoch": 0.3834522111269615,
      "grad_norm": 3.625,
      "learning_rate": 4.736424735627193e-06,
      "loss": 0.472,
      "mean_token_accuracy": 0.8653873577713966,
      "num_tokens": 982797.0,
      "step": 168
    },
    {
      "entropy": 1.4145529568195343,
      "epoch": 0.385734664764622,
      "grad_norm": 4.15625,
      "learning_rate": 4.73220004789747e-06,
      "loss": 0.4677,
      "mean_token_accuracy": 0.8689080029726028,
      "num_tokens": 988588.0,
      "step": 169
    },
    {
      "entropy": 1.4675364196300507,
      "epoch": 0.3880171184022825,
      "grad_norm": 3.796875,
      "learning_rate": 4.7279436863623805e-06,
      "loss": 0.4218,
      "mean_token_accuracy": 0.8724250420928001,
      "num_tokens": 994490.0,
      "step": 170
    },
    {
      "entropy": 1.4822284132242203,
      "epoch": 0.3902995720399429,
      "grad_norm": 3.25,
      "learning_rate": 4.7236557114175705e-06,
      "loss": 0.4036,
      "mean_token_accuracy": 0.8729385659098625,
      "num_tokens": 1000341.0,
      "step": 171
    },
    {
      "entropy": 1.5275023579597473,
      "epoch": 0.3925820256776034,
      "grad_norm": 3.71875,
      "learning_rate": 4.719336183907266e-06,
      "loss": 0.5107,
      "mean_token_accuracy": 0.846622422337532,
      "num_tokens": 1005552.0,
      "step": 172
    },
    {
      "entropy": 1.4371494799852371,
      "epoch": 0.3948644793152639,
      "grad_norm": 3.859375,
      "learning_rate": 4.7149851651234085e-06,
      "loss": 0.4761,
      "mean_token_accuracy": 0.856620728969574,
      "num_tokens": 1011272.0,
      "step": 173
    },
    {
      "entropy": 1.4481075257062912,
      "epoch": 0.3971469329529244,
      "grad_norm": 3.265625,
      "learning_rate": 4.710602716804784e-06,
      "loss": 0.4907,
      "mean_token_accuracy": 0.8551308736205101,
      "num_tokens": 1018025.0,
      "step": 174
    },
    {
      "entropy": 1.4776830077171326,
      "epoch": 0.39942938659058486,
      "grad_norm": 3.484375,
      "learning_rate": 4.706188901136148e-06,
      "loss": 0.4157,
      "mean_token_accuracy": 0.8659848943352699,
      "num_tokens": 1023559.0,
      "step": 175
    },
    {
      "entropy": 1.3460393995046616,
      "epoch": 0.40171184022824535,
      "grad_norm": 3.15625,
      "learning_rate": 4.701743780747345e-06,
      "loss": 0.3891,
      "mean_token_accuracy": 0.8979940786957741,
      "num_tokens": 1029587.0,
      "step": 176
    },
    {
      "entropy": 1.5323508977890015,
      "epoch": 0.40399429386590585,
      "grad_norm": 3.671875,
      "learning_rate": 4.697267418712415e-06,
      "loss": 0.5064,
      "mean_token_accuracy": 0.8600496724247932,
      "num_tokens": 1035523.0,
      "step": 177
    },
    {
      "entropy": 1.3961755633354187,
      "epoch": 0.40627674750356635,
      "grad_norm": 3.4375,
      "learning_rate": 4.6927598785487026e-06,
      "loss": 0.4937,
      "mean_token_accuracy": 0.8478540182113647,
      "num_tokens": 1041403.0,
      "step": 178
    },
    {
      "entropy": 1.4182656705379486,
      "epoch": 0.40855920114122685,
      "grad_norm": 3.015625,
      "learning_rate": 4.6882212242159555e-06,
      "loss": 0.3456,
      "mean_token_accuracy": 0.8982625529170036,
      "num_tokens": 1047682.0,
      "step": 179
    },
    {
      "entropy": 1.548415094614029,
      "epoch": 0.4108416547788873,
      "grad_norm": 4.3125,
      "learning_rate": 4.683651520115414e-06,
      "loss": 0.5678,
      "mean_token_accuracy": 0.8428888395428658,
      "num_tokens": 1053172.0,
      "step": 180
    },
    {
      "entropy": 1.396517127752304,
      "epoch": 0.4131241084165478,
      "grad_norm": 3.46875,
      "learning_rate": 4.679050831088902e-06,
      "loss": 0.4803,
      "mean_token_accuracy": 0.856790341436863,
      "num_tokens": 1059373.0,
      "step": 181
    },
    {
      "entropy": 1.3589655607938766,
      "epoch": 0.4154065620542083,
      "grad_norm": 3.53125,
      "learning_rate": 4.674419222417899e-06,
      "loss": 0.3944,
      "mean_token_accuracy": 0.8856743425130844,
      "num_tokens": 1065347.0,
      "step": 182
    },
    {
      "entropy": 1.5359989404678345,
      "epoch": 0.4176890156918688,
      "grad_norm": 4.15625,
      "learning_rate": 4.669756759822625e-06,
      "loss": 0.4896,
      "mean_token_accuracy": 0.8504308834671974,
      "num_tokens": 1070311.0,
      "step": 183
    },
    {
      "entropy": 1.3297200053930283,
      "epoch": 0.4199714693295292,
      "grad_norm": 3.21875,
      "learning_rate": 4.665063509461098e-06,
      "loss": 0.3047,
      "mean_token_accuracy": 0.9152820706367493,
      "num_tokens": 1076590.0,
      "step": 184
    },
    {
      "entropy": 1.3356045931577682,
      "epoch": 0.4222539229671897,
      "grad_norm": 2.9375,
      "learning_rate": 4.660339537928198e-06,
      "loss": 0.3891,
      "mean_token_accuracy": 0.8858283907175064,
      "num_tokens": 1082550.0,
      "step": 185
    },
    {
      "entropy": 1.518212452530861,
      "epoch": 0.4245363766048502,
      "grad_norm": 3.078125,
      "learning_rate": 4.655584912254727e-06,
      "loss": 0.393,
      "mean_token_accuracy": 0.8758783265948296,
      "num_tokens": 1088391.0,
      "step": 186
    },
    {
      "entropy": 1.488260880112648,
      "epoch": 0.4268188302425107,
      "grad_norm": 3.65625,
      "learning_rate": 4.650799699906452e-06,
      "loss": 0.4005,
      "mean_token_accuracy": 0.871321365237236,
      "num_tokens": 1093823.0,
      "step": 187
    },
    {
      "entropy": 1.4447910338640213,
      "epoch": 0.42910128388017116,
      "grad_norm": 3.21875,
      "learning_rate": 4.645983968783148e-06,
      "loss": 0.3873,
      "mean_token_accuracy": 0.8878121376037598,
      "num_tokens": 1099347.0,
      "step": 188
    },
    {
      "entropy": 1.4393097907304764,
      "epoch": 0.43138373751783166,
      "grad_norm": 3.40625,
      "learning_rate": 4.64113778721764e-06,
      "loss": 0.3712,
      "mean_token_accuracy": 0.8943579867482185,
      "num_tokens": 1104941.0,
      "step": 189
    },
    {
      "entropy": 1.5411454141139984,
      "epoch": 0.43366619115549215,
      "grad_norm": 4.15625,
      "learning_rate": 4.636261223974826e-06,
      "loss": 0.498,
      "mean_token_accuracy": 0.8571378961205482,
      "num_tokens": 1110031.0,
      "step": 190
    },
    {
      "entropy": 1.3604239225387573,
      "epoch": 0.43594864479315265,
      "grad_norm": 3.53125,
      "learning_rate": 4.631354348250706e-06,
      "loss": 0.4366,
      "mean_token_accuracy": 0.8668901473283768,
      "num_tokens": 1116176.0,
      "step": 191
    },
    {
      "entropy": 1.4267419427633286,
      "epoch": 0.43823109843081315,
      "grad_norm": 3.125,
      "learning_rate": 4.626417229671401e-06,
      "loss": 0.4324,
      "mean_token_accuracy": 0.8729524612426758,
      "num_tokens": 1122065.0,
      "step": 192
    },
    {
      "entropy": 1.554912507534027,
      "epoch": 0.4405135520684736,
      "grad_norm": 4.34375,
      "learning_rate": 4.621449938292159e-06,
      "loss": 0.5843,
      "mean_token_accuracy": 0.8273278325796127,
      "num_tokens": 1127506.0,
      "step": 193
    },
    {
      "entropy": 1.3502502888441086,
      "epoch": 0.4427960057061341,
      "grad_norm": 2.828125,
      "learning_rate": 4.616452544596367e-06,
      "loss": 0.3874,
      "mean_token_accuracy": 0.8785886839032173,
      "num_tokens": 1133494.0,
      "step": 194
    },
    {
      "entropy": 1.4718603789806366,
      "epoch": 0.4450784593437946,
      "grad_norm": 3.90625,
      "learning_rate": 4.611425119494552e-06,
      "loss": 0.4499,
      "mean_token_accuracy": 0.8621420189738274,
      "num_tokens": 1139036.0,
      "step": 195
    },
    {
      "entropy": 1.592808559536934,
      "epoch": 0.4473609129814551,
      "grad_norm": 4.5625,
      "learning_rate": 4.606367734323365e-06,
      "loss": 0.5667,
      "mean_token_accuracy": 0.832310289144516,
      "num_tokens": 1144022.0,
      "step": 196
    },
    {
      "entropy": 1.410594865679741,
      "epoch": 0.4496433666191155,
      "grad_norm": 4.03125,
      "learning_rate": 4.601280460844583e-06,
      "loss": 0.5266,
      "mean_token_accuracy": 0.855924166738987,
      "num_tokens": 1150011.0,
      "step": 197
    },
    {
      "entropy": 1.4304940402507782,
      "epoch": 0.451925820256776,
      "grad_norm": 4.46875,
      "learning_rate": 4.596163371244076e-06,
      "loss": 0.5302,
      "mean_token_accuracy": 0.8468711525201797,
      "num_tokens": 1155938.0,
      "step": 198
    },
    {
      "entropy": 1.4850642681121826,
      "epoch": 0.4542082738944365,
      "grad_norm": 3.8125,
      "learning_rate": 4.591016538130796e-06,
      "loss": 0.5296,
      "mean_token_accuracy": 0.8607726991176605,
      "num_tokens": 1161187.0,
      "step": 199
    },
    {
      "entropy": 1.495200276374817,
      "epoch": 0.456490727532097,
      "grad_norm": 3.71875,
      "learning_rate": 4.585840034535736e-06,
      "loss": 0.4806,
      "mean_token_accuracy": 0.865336537361145,
      "num_tokens": 1167354.0,
      "step": 200
    },
    {
      "epoch": 0.456490727532097,
      "eval_entropy": 1.4645510156949362,
      "eval_loss": 0.48574092984199524,
      "eval_mean_token_accuracy": 0.8648963557349311,
      "eval_num_tokens": 1167354.0,
      "eval_runtime": 4.6146,
      "eval_samples_per_second": 19.503,
      "eval_steps_per_second": 19.503,
      "step": 200
    },
    {
      "entropy": 1.5986905246973038,
      "epoch": 0.4587731811697575,
      "grad_norm": 3.984375,
      "learning_rate": 4.580633933910901e-06,
      "loss": 0.4827,
      "mean_token_accuracy": 0.8589570224285126,
      "num_tokens": 1173168.0,
      "step": 201
    },
    {
      "entropy": 1.4304189831018448,
      "epoch": 0.46105563480741796,
      "grad_norm": 3.328125,
      "learning_rate": 4.575398310128263e-06,
      "loss": 0.432,
      "mean_token_accuracy": 0.870637446641922,
      "num_tokens": 1178884.0,
      "step": 202
    },
    {
      "entropy": 1.5412327647209167,
      "epoch": 0.46333808844507846,
      "grad_norm": 4.03125,
      "learning_rate": 4.570133237478711e-06,
      "loss": 0.5089,
      "mean_token_accuracy": 0.8491686582565308,
      "num_tokens": 1184480.0,
      "step": 203
    },
    {
      "entropy": 1.4805094599723816,
      "epoch": 0.46562054208273895,
      "grad_norm": 3.453125,
      "learning_rate": 4.564838790671e-06,
      "loss": 0.5336,
      "mean_token_accuracy": 0.8480750620365143,
      "num_tokens": 1190484.0,
      "step": 204
    },
    {
      "entropy": 1.4799759984016418,
      "epoch": 0.46790299572039945,
      "grad_norm": 3.359375,
      "learning_rate": 4.55951504483069e-06,
      "loss": 0.4372,
      "mean_token_accuracy": 0.8827960044145584,
      "num_tokens": 1195901.0,
      "step": 205
    },
    {
      "entropy": 1.5237813293933868,
      "epoch": 0.4701854493580599,
      "grad_norm": 4.125,
      "learning_rate": 4.55416207549908e-06,
      "loss": 0.613,
      "mean_token_accuracy": 0.8307090178132057,
      "num_tokens": 1201383.0,
      "step": 206
    },
    {
      "entropy": 1.438712790608406,
      "epoch": 0.4724679029957204,
      "grad_norm": 3.21875,
      "learning_rate": 4.548779958632134e-06,
      "loss": 0.5351,
      "mean_token_accuracy": 0.8520702794194221,
      "num_tokens": 1207874.0,
      "step": 207
    },
    {
      "entropy": 1.4036246687173843,
      "epoch": 0.4747503566333809,
      "grad_norm": 3.390625,
      "learning_rate": 4.543368770599406e-06,
      "loss": 0.346,
      "mean_token_accuracy": 0.8787712529301643,
      "num_tokens": 1213989.0,
      "step": 208
    },
    {
      "entropy": 1.486038789153099,
      "epoch": 0.4770328102710414,
      "grad_norm": 3.71875,
      "learning_rate": 4.537928588182955e-06,
      "loss": 0.5211,
      "mean_token_accuracy": 0.8482290953397751,
      "num_tokens": 1219525.0,
      "step": 209
    },
    {
      "entropy": 1.444077506661415,
      "epoch": 0.4793152639087018,
      "grad_norm": 2.953125,
      "learning_rate": 4.532459488576258e-06,
      "loss": 0.3976,
      "mean_token_accuracy": 0.8832324147224426,
      "num_tokens": 1226231.0,
      "step": 210
    },
    {
      "entropy": 1.5054399818181992,
      "epoch": 0.4815977175463623,
      "grad_norm": 3.671875,
      "learning_rate": 4.526961549383109e-06,
      "loss": 0.4581,
      "mean_token_accuracy": 0.8546851649880409,
      "num_tokens": 1232271.0,
      "step": 211
    },
    {
      "entropy": 1.4887232929468155,
      "epoch": 0.4838801711840228,
| "grad_norm": 3.1875, | |
| "learning_rate": 4.521434848616523e-06, | |
| "loss": 0.4776, | |
| "mean_token_accuracy": 0.8665826469659805, | |
| "num_tokens": 1239076.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 1.4471513032913208, | |
| "epoch": 0.4861626248216833, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 4.515879464697629e-06, | |
| "loss": 0.3437, | |
| "mean_token_accuracy": 0.9033405035734177, | |
| "num_tokens": 1245117.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 1.4255793392658234, | |
| "epoch": 0.4884450784593438, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 4.5102954764545525e-06, | |
| "loss": 0.3922, | |
| "mean_token_accuracy": 0.879116877913475, | |
| "num_tokens": 1251024.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 1.4146728217601776, | |
| "epoch": 0.49072753209700426, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 4.5046829631213014e-06, | |
| "loss": 0.4581, | |
| "mean_token_accuracy": 0.8701305538415909, | |
| "num_tokens": 1257738.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 1.4356386065483093, | |
| "epoch": 0.49300998573466476, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 4.499042004336642e-06, | |
| "loss": 0.4283, | |
| "mean_token_accuracy": 0.8771600425243378, | |
| "num_tokens": 1265254.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 1.3496776968240738, | |
| "epoch": 0.49529243937232525, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 4.4933726801429665e-06, | |
| "loss": 0.3705, | |
| "mean_token_accuracy": 0.8920829594135284, | |
| "num_tokens": 1271970.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 1.5127773433923721, | |
| "epoch": 0.49757489300998575, | |
| "grad_norm": 3.78125, | |
| "learning_rate": 4.487675070985156e-06, | |
| "loss": 0.4624, | |
| "mean_token_accuracy": 0.8566678315401077, | |
| "num_tokens": 1277606.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 1.4766086488962173, | |
| "epoch": 0.4998573466476462, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 4.481949257709442e-06, | |
| "loss": 0.4412, | |
| "mean_token_accuracy": 0.8686520978808403, | |
| "num_tokens": 1283617.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 1.5000656843185425, | |
| "epoch": 0.5021398002853067, | |
| "grad_norm": 4.40625, | |
| "learning_rate": 4.476195321562262e-06, | |
| "loss": 0.5898, | |
| "mean_token_accuracy": 0.8323855772614479, | |
| "num_tokens": 1289328.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.4562593698501587, | |
| "epoch": 0.5044222539229671, | |
| "grad_norm": 3.625, | |
| "learning_rate": 4.470413344189098e-06, | |
| "loss": 0.4657, | |
| "mean_token_accuracy": 0.8688141480088234, | |
| "num_tokens": 1294897.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 1.412929117679596, | |
| "epoch": 0.5067047075606277, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 4.464603407633326e-06, | |
| "loss": 0.4717, | |
| "mean_token_accuracy": 0.8586973398923874, | |
| "num_tokens": 1300887.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 1.5253776609897614, | |
| "epoch": 0.5089871611982881, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 4.458765594335048e-06, | |
| "loss": 0.473, | |
| "mean_token_accuracy": 0.8543320819735527, | |
| "num_tokens": 1306712.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 1.5946801453828812, | |
| "epoch": 0.5112696148359487, | |
| "grad_norm": 3.90625, | |
| "learning_rate": 4.452899987129922e-06, | |
| "loss": 0.5303, | |
| "mean_token_accuracy": 0.8440029099583626, | |
| "num_tokens": 1311955.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 1.3364089578390121, | |
| "epoch": 0.5135520684736091, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 4.44700666924799e-06, | |
| "loss": 0.3431, | |
| "mean_token_accuracy": 0.8987620249390602, | |
| "num_tokens": 1318460.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 1.4394992887973785, | |
| "epoch": 0.5158345221112696, | |
| "grad_norm": 3.0, | |
| "learning_rate": 4.441085724312494e-06, | |
| "loss": 0.4805, | |
| "mean_token_accuracy": 0.861751489341259, | |
| "num_tokens": 1325269.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 1.4739690721035004, | |
| "epoch": 0.5181169757489301, | |
| "grad_norm": 3.5, | |
| "learning_rate": 4.435137236338688e-06, | |
| "loss": 0.4712, | |
| "mean_token_accuracy": 0.8692339286208153, | |
| "num_tokens": 1331087.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 1.408553659915924, | |
| "epoch": 0.5203994293865906, | |
| "grad_norm": 3.703125, | |
| "learning_rate": 4.42916128973265e-06, | |
| "loss": 0.545, | |
| "mean_token_accuracy": 0.8480049669742584, | |
| "num_tokens": 1336928.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 1.4906915128231049, | |
| "epoch": 0.5226818830242511, | |
| "grad_norm": 4.3125, | |
| "learning_rate": 4.423157969290081e-06, | |
| "loss": 0.4943, | |
| "mean_token_accuracy": 0.8629228696227074, | |
| "num_tokens": 1341951.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 1.5799495428800583, | |
| "epoch": 0.5249643366619116, | |
| "grad_norm": 3.875, | |
| "learning_rate": 4.417127360195107e-06, | |
| "loss": 0.454, | |
| "mean_token_accuracy": 0.8446270450949669, | |
| "num_tokens": 1346983.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.3668962121009827, | |
| "epoch": 0.527246790299572, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 4.41106954801906e-06, | |
| "loss": 0.3977, | |
| "mean_token_accuracy": 0.8871706500649452, | |
| "num_tokens": 1354122.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 1.5603487640619278, | |
| "epoch": 0.5295292439372326, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 4.404984618719275e-06, | |
| "loss": 0.4717, | |
| "mean_token_accuracy": 0.8657551482319832, | |
| "num_tokens": 1359608.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 1.4570914506912231, | |
| "epoch": 0.531811697574893, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 4.398872658637863e-06, | |
| "loss": 0.4311, | |
| "mean_token_accuracy": 0.8685552924871445, | |
| "num_tokens": 1365590.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 1.329675242304802, | |
| "epoch": 0.5340941512125535, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 4.39273375450049e-06, | |
| "loss": 0.4566, | |
| "mean_token_accuracy": 0.8627236634492874, | |
| "num_tokens": 1372145.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 1.4357402175664902, | |
| "epoch": 0.536376604850214, | |
| "grad_norm": 3.5, | |
| "learning_rate": 4.386567993415144e-06, | |
| "loss": 0.4507, | |
| "mean_token_accuracy": 0.8667884543538094, | |
| "num_tokens": 1377900.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 1.5077559649944305, | |
| "epoch": 0.5386590584878744, | |
| "grad_norm": 3.984375, | |
| "learning_rate": 4.3803754628708995e-06, | |
| "loss": 0.5176, | |
| "mean_token_accuracy": 0.8583211898803711, | |
| "num_tokens": 1383999.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 1.3777508586645126, | |
| "epoch": 0.540941512125535, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 4.3741562507366754e-06, | |
| "loss": 0.3431, | |
| "mean_token_accuracy": 0.8923545554280281, | |
| "num_tokens": 1390419.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 1.4933728128671646, | |
| "epoch": 0.5432239657631954, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 4.367910445259991e-06, | |
| "loss": 0.4044, | |
| "mean_token_accuracy": 0.8686385452747345, | |
| "num_tokens": 1396684.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 1.4653480350971222, | |
| "epoch": 0.5455064194008559, | |
| "grad_norm": 3.25, | |
| "learning_rate": 4.361638135065711e-06, | |
| "loss": 0.4561, | |
| "mean_token_accuracy": 0.8716481998562813, | |
| "num_tokens": 1402830.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 1.5274227857589722, | |
| "epoch": 0.5477888730385164, | |
| "grad_norm": 3.6875, | |
| "learning_rate": 4.355339409154788e-06, | |
| "loss": 0.5069, | |
| "mean_token_accuracy": 0.8373076170682907, | |
| "num_tokens": 1408506.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.4511406421661377, | |
| "epoch": 0.5500713266761769, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 4.3490143569030025e-06, | |
| "loss": 0.4684, | |
| "mean_token_accuracy": 0.8665965721011162, | |
| "num_tokens": 1414792.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 1.3838857859373093, | |
| "epoch": 0.5523537803138374, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 4.34266306805969e-06, | |
| "loss": 0.4547, | |
| "mean_token_accuracy": 0.8690644651651382, | |
| "num_tokens": 1420524.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 1.4130767732858658, | |
| "epoch": 0.5546362339514979, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 4.336285632746472e-06, | |
| "loss": 0.471, | |
| "mean_token_accuracy": 0.8564508408308029, | |
| "num_tokens": 1426426.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 1.618276908993721, | |
| "epoch": 0.5569186875891583, | |
| "grad_norm": 4.03125, | |
| "learning_rate": 4.329882141455974e-06, | |
| "loss": 0.5143, | |
| "mean_token_accuracy": 0.8403759598731995, | |
| "num_tokens": 1431586.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 1.4412871301174164, | |
| "epoch": 0.5592011412268189, | |
| "grad_norm": 3.90625, | |
| "learning_rate": 4.323452685050545e-06, | |
| "loss": 0.4539, | |
| "mean_token_accuracy": 0.863670825958252, | |
| "num_tokens": 1437354.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 1.4914350509643555, | |
| "epoch": 0.5614835948644793, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 4.316997354760965e-06, | |
| "loss": 0.3826, | |
| "mean_token_accuracy": 0.8802237138152122, | |
| "num_tokens": 1443221.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 1.5026773810386658, | |
| "epoch": 0.5637660485021398, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 4.3105162421851494e-06, | |
| "loss": 0.4275, | |
| "mean_token_accuracy": 0.8739782869815826, | |
| "num_tokens": 1448716.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 1.467271402478218, | |
| "epoch": 0.5660485021398003, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 4.304009439286855e-06, | |
| "loss": 0.4786, | |
| "mean_token_accuracy": 0.8454955220222473, | |
| "num_tokens": 1453607.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 1.3084248155355453, | |
| "epoch": 0.5683309557774607, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 4.297477038394368e-06, | |
| "loss": 0.4264, | |
| "mean_token_accuracy": 0.8782637789845467, | |
| "num_tokens": 1460122.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 1.4157914519309998, | |
| "epoch": 0.5706134094151213, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 4.2909191321992e-06, | |
| "loss": 0.4883, | |
| "mean_token_accuracy": 0.8630497455596924, | |
| "num_tokens": 1466789.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.3701231330633163, | |
| "epoch": 0.5728958630527817, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 4.28433581375477e-06, | |
| "loss": 0.4331, | |
| "mean_token_accuracy": 0.874555304646492, | |
| "num_tokens": 1472752.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 1.5737513154745102, | |
| "epoch": 0.5751783166904422, | |
| "grad_norm": 3.625, | |
| "learning_rate": 4.2777271764750805e-06, | |
| "loss": 0.4553, | |
| "mean_token_accuracy": 0.8664311170578003, | |
| "num_tokens": 1478473.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 1.525623768568039, | |
| "epoch": 0.5774607703281027, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 4.271093314133401e-06, | |
| "loss": 0.466, | |
| "mean_token_accuracy": 0.8556927219033241, | |
| "num_tokens": 1484284.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 1.4639706760644913, | |
| "epoch": 0.5797432239657632, | |
| "grad_norm": 3.75, | |
| "learning_rate": 4.264434320860929e-06, | |
| "loss": 0.5532, | |
| "mean_token_accuracy": 0.844054289162159, | |
| "num_tokens": 1490166.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 1.5366946905851364, | |
| "epoch": 0.5820256776034237, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 4.257750291145457e-06, | |
| "loss": 0.5268, | |
| "mean_token_accuracy": 0.8521439135074615, | |
| "num_tokens": 1495689.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 1.5063273757696152, | |
| "epoch": 0.5843081312410842, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 4.251041319830034e-06, | |
| "loss": 0.5244, | |
| "mean_token_accuracy": 0.8497593402862549, | |
| "num_tokens": 1501104.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 1.5439026057720184, | |
| "epoch": 0.5865905848787446, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 4.2443075021116166e-06, | |
| "loss": 0.3605, | |
| "mean_token_accuracy": 0.8726519420742989, | |
| "num_tokens": 1506924.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 1.4876836389303207, | |
| "epoch": 0.5888730385164052, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 4.237548933539718e-06, | |
| "loss": 0.4703, | |
| "mean_token_accuracy": 0.866664931178093, | |
| "num_tokens": 1512828.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 1.480648323893547, | |
| "epoch": 0.5911554921540656, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 4.230765710015058e-06, | |
| "loss": 0.466, | |
| "mean_token_accuracy": 0.8522143065929413, | |
| "num_tokens": 1518522.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 1.4419532120227814, | |
| "epoch": 0.5934379457917262, | |
| "grad_norm": 3.703125, | |
| "learning_rate": 4.223957927788195e-06, | |
| "loss": 0.4973, | |
| "mean_token_accuracy": 0.8543191030621529, | |
| "num_tokens": 1523970.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.4034761041402817, | |
| "epoch": 0.5957203994293866, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 4.217125683458162e-06, | |
| "loss": 0.3724, | |
| "mean_token_accuracy": 0.8887425437569618, | |
| "num_tokens": 1530150.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 1.5668024867773056, | |
| "epoch": 0.598002853067047, | |
| "grad_norm": 4.5, | |
| "learning_rate": 4.210269073971098e-06, | |
| "loss": 0.4921, | |
| "mean_token_accuracy": 0.8630413040518761, | |
| "num_tokens": 1535368.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 1.4702572673559189, | |
| "epoch": 0.6002853067047076, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 4.203388196618874e-06, | |
| "loss": 0.3834, | |
| "mean_token_accuracy": 0.8823850229382515, | |
| "num_tokens": 1541388.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 1.353348970413208, | |
| "epoch": 0.602567760342368, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 4.196483149037707e-06, | |
| "loss": 0.3882, | |
| "mean_token_accuracy": 0.8797778934240341, | |
| "num_tokens": 1547245.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 1.3397300243377686, | |
| "epoch": 0.6048502139800286, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 4.1895540292067765e-06, | |
| "loss": 0.4969, | |
| "mean_token_accuracy": 0.8677136451005936, | |
| "num_tokens": 1553007.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 1.533875733613968, | |
| "epoch": 0.607132667617689, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 4.18260093544684e-06, | |
| "loss": 0.5423, | |
| "mean_token_accuracy": 0.8619329035282135, | |
| "num_tokens": 1559044.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 1.4415863156318665, | |
| "epoch": 0.6094151212553495, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 4.1756239664188275e-06, | |
| "loss": 0.4586, | |
| "mean_token_accuracy": 0.8679523020982742, | |
| "num_tokens": 1565121.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 1.5389132052659988, | |
| "epoch": 0.61169757489301, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 4.168623221122451e-06, | |
| "loss": 0.3954, | |
| "mean_token_accuracy": 0.8800017014145851, | |
| "num_tokens": 1570839.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 1.4849002212285995, | |
| "epoch": 0.6139800285306705, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 4.161598798894795e-06, | |
| "loss": 0.5272, | |
| "mean_token_accuracy": 0.842116691172123, | |
| "num_tokens": 1576765.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 1.526948407292366, | |
| "epoch": 0.6162624821683309, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 4.154550799408906e-06, | |
| "loss": 0.4815, | |
| "mean_token_accuracy": 0.8517501726746559, | |
| "num_tokens": 1582404.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.5471256375312805, | |
| "epoch": 0.6185449358059915, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 4.147479322672383e-06, | |
| "loss": 0.5704, | |
| "mean_token_accuracy": 0.8349821045994759, | |
| "num_tokens": 1588027.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 1.3742996156215668, | |
| "epoch": 0.6208273894436519, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 4.1403844690259544e-06, | |
| "loss": 0.4357, | |
| "mean_token_accuracy": 0.8906814530491829, | |
| "num_tokens": 1594482.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 1.7183980494737625, | |
| "epoch": 0.6231098430813125, | |
| "grad_norm": 4.625, | |
| "learning_rate": 4.1332663391420515e-06, | |
| "loss": 0.6023, | |
| "mean_token_accuracy": 0.8240282908082008, | |
| "num_tokens": 1599978.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 1.4364304840564728, | |
| "epoch": 0.6253922967189729, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 4.126125034023392e-06, | |
| "loss": 0.4642, | |
| "mean_token_accuracy": 0.8591607213020325, | |
| "num_tokens": 1606427.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 1.4346765726804733, | |
| "epoch": 0.6276747503566333, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 4.11896065500153e-06, | |
| "loss": 0.4256, | |
| "mean_token_accuracy": 0.8701624721288681, | |
| "num_tokens": 1612618.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 1.625702291727066, | |
| "epoch": 0.6299572039942939, | |
| "grad_norm": 5.125, | |
| "learning_rate": 4.111773303735432e-06, | |
| "loss": 0.4558, | |
| "mean_token_accuracy": 0.8545658215880394, | |
| "num_tokens": 1617388.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 1.4506097733974457, | |
| "epoch": 0.6322396576319543, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 4.104563082210028e-06, | |
| "loss": 0.4293, | |
| "mean_token_accuracy": 0.8728143572807312, | |
| "num_tokens": 1623851.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 1.5303080081939697, | |
| "epoch": 0.6345221112696149, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 4.097330092734765e-06, | |
| "loss": 0.5024, | |
| "mean_token_accuracy": 0.8505230322480202, | |
| "num_tokens": 1629428.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 1.4354898631572723, | |
| "epoch": 0.6368045649072753, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 4.090074437942155e-06, | |
| "loss": 0.435, | |
| "mean_token_accuracy": 0.8785936459898949, | |
| "num_tokens": 1635769.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 1.547384113073349, | |
| "epoch": 0.6390870185449358, | |
| "grad_norm": 4.0, | |
| "learning_rate": 4.082796220786324e-06, | |
| "loss": 0.5469, | |
| "mean_token_accuracy": 0.8383355513215065, | |
| "num_tokens": 1641791.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.480806604027748, | |
| "epoch": 0.6413694721825963, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 4.0754955445415405e-06, | |
| "loss": 0.4233, | |
| "mean_token_accuracy": 0.8961210995912552, | |
| "num_tokens": 1646709.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 1.4669694900512695, | |
| "epoch": 0.6436519258202568, | |
| "grad_norm": 4.0, | |
| "learning_rate": 4.06817251280076e-06, | |
| "loss": 0.4288, | |
| "mean_token_accuracy": 0.8806118816137314, | |
| "num_tokens": 1651676.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 1.4136276096105576, | |
| "epoch": 0.6459343794579172, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 4.06082722947415e-06, | |
| "loss": 0.4005, | |
| "mean_token_accuracy": 0.8672489523887634, | |
| "num_tokens": 1657293.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 1.4642555862665176, | |
| "epoch": 0.6482168330955778, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 4.053459798787619e-06, | |
| "loss": 0.4534, | |
| "mean_token_accuracy": 0.8670831546187401, | |
| "num_tokens": 1662778.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 1.4143490493297577, | |
| "epoch": 0.6504992867332382, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 4.046070325281333e-06, | |
| "loss": 0.4511, | |
| "mean_token_accuracy": 0.8704198077321053, | |
| "num_tokens": 1669050.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 1.402878537774086, | |
| "epoch": 0.6527817403708988, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 4.038658913808235e-06, | |
| "loss": 0.3552, | |
| "mean_token_accuracy": 0.8852335959672928, | |
| "num_tokens": 1675168.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 1.4332346022129059, | |
| "epoch": 0.6550641940085592, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 4.031225669532558e-06, | |
| "loss": 0.4411, | |
| "mean_token_accuracy": 0.8605756536126137, | |
| "num_tokens": 1680716.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 1.4855122715234756, | |
| "epoch": 0.6573466476462196, | |
| "grad_norm": 3.890625, | |
| "learning_rate": 4.0237706979283306e-06, | |
| "loss": 0.5067, | |
| "mean_token_accuracy": 0.8480587676167488, | |
| "num_tokens": 1686358.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 1.415476381778717, | |
| "epoch": 0.6596291012838802, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 4.016294104777883e-06, | |
| "loss": 0.3724, | |
| "mean_token_accuracy": 0.8872483521699905, | |
| "num_tokens": 1692477.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 1.4918617755174637, | |
| "epoch": 0.6619115549215406, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 4.008795996170341e-06, | |
| "loss": 0.481, | |
| "mean_token_accuracy": 0.8568604290485382, | |
| "num_tokens": 1698377.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.3961764425039291, | |
| "epoch": 0.6641940085592012, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 4.001276478500127e-06, | |
| "loss": 0.3972, | |
| "mean_token_accuracy": 0.885112538933754, | |
| "num_tokens": 1704209.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 1.4769706726074219, | |
| "epoch": 0.6664764621968616, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 3.993735658465446e-06, | |
| "loss": 0.5053, | |
| "mean_token_accuracy": 0.8577989414334297, | |
| "num_tokens": 1710422.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 1.3838878571987152, | |
| "epoch": 0.6687589158345221, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 3.986173643066774e-06, | |
| "loss": 0.3759, | |
| "mean_token_accuracy": 0.8760515302419662, | |
| "num_tokens": 1716105.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 1.3878977000713348, | |
| "epoch": 0.6710413694721826, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 3.978590539605338e-06, | |
| "loss": 0.329, | |
| "mean_token_accuracy": 0.8979349583387375, | |
| "num_tokens": 1723015.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 1.4417504221200943, | |
| "epoch": 0.6733238231098431, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 3.970986455681593e-06, | |
| "loss": 0.5339, | |
| "mean_token_accuracy": 0.854948602616787, | |
| "num_tokens": 1729102.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 1.4643060863018036, | |
| "epoch": 0.6756062767475036, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 3.963361499193699e-06, | |
| "loss": 0.4545, | |
| "mean_token_accuracy": 0.8652586191892624, | |
| "num_tokens": 1734903.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 1.4911223948001862, | |
| "epoch": 0.6778887303851641, | |
| "grad_norm": 3.25, | |
| "learning_rate": 3.955715778335984e-06, | |
| "loss": 0.4584, | |
| "mean_token_accuracy": 0.8684913441538811, | |
| "num_tokens": 1740820.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 1.414558470249176, | |
| "epoch": 0.6801711840228245, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 3.948049401597414e-06, | |
| "loss": 0.4304, | |
| "mean_token_accuracy": 0.8772279694676399, | |
| "num_tokens": 1747285.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 1.4684519618749619, | |
| "epoch": 0.6824536376604851, | |
| "grad_norm": 3.5, | |
| "learning_rate": 3.9403624777600526e-06, | |
| "loss": 0.3402, | |
| "mean_token_accuracy": 0.8974613174796104, | |
| "num_tokens": 1752238.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 1.513798087835312, | |
| "epoch": 0.6847360912981455, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 3.932655115897513e-06, | |
| "loss": 0.518, | |
| "mean_token_accuracy": 0.8387879729270935, | |
| "num_tokens": 1757263.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6847360912981455, | |
| "eval_entropy": 1.4728518161508772, | |
| "eval_loss": 0.4787273108959198, | |
| "eval_mean_token_accuracy": 0.8652989500098758, | |
| "eval_num_tokens": 1757263.0, | |
| "eval_runtime": 4.4526, | |
| "eval_samples_per_second": 20.213, | |
| "eval_steps_per_second": 20.213, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.5329459309577942, | |
| "epoch": 0.6870185449358059, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 3.924927425373417e-06, | |
| "loss": 0.3762, | |
| "mean_token_accuracy": 0.8721340969204903, | |
| "num_tokens": 1762777.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 1.3736970275640488, | |
| "epoch": 0.6893009985734665, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 3.91717951583984e-06, | |
| "loss": 0.403, | |
| "mean_token_accuracy": 0.8769481182098389, | |
| "num_tokens": 1769311.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 1.4778434038162231, | |
| "epoch": 0.6915834522111269, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 3.909411497235752e-06, | |
| "loss": 0.4176, | |
| "mean_token_accuracy": 0.8799067437648773, | |
| "num_tokens": 1775618.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 1.493824690580368, | |
| "epoch": 0.6938659058487875, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 3.901623479785465e-06, | |
| "loss": 0.4883, | |
| "mean_token_accuracy": 0.8613429367542267, | |
| "num_tokens": 1782559.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 1.412913128733635, | |
| "epoch": 0.6961483594864479, | |
| "grad_norm": 4.28125, | |
| "learning_rate": 3.89381557399706e-06, | |
| "loss": 0.4606, | |
| "mean_token_accuracy": 0.8659727945923805, | |
| "num_tokens": 1788268.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 1.3977010250091553, | |
| "epoch": 0.6984308131241084, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 3.885987890660828e-06, | |
| "loss": 0.3609, | |
| "mean_token_accuracy": 0.8855833634734154, | |
| "num_tokens": 1794289.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 1.4351555556058884, | |
| "epoch": 0.7007132667617689, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 3.87814054084769e-06, | |
| "loss": 0.3922, | |
| "mean_token_accuracy": 0.882360152900219, | |
| "num_tokens": 1800100.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 1.4589954763650894, | |
| "epoch": 0.7029957203994294, | |
| "grad_norm": 3.84375, | |
| "learning_rate": 3.8702736359076265e-06, | |
| "loss": 0.4728, | |
| "mean_token_accuracy": 0.8583435043692589, | |
| "num_tokens": 1806175.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 1.497866302728653, | |
| "epoch": 0.7052781740370899, | |
| "grad_norm": 3.984375, | |
| "learning_rate": 3.862387287468095e-06, | |
| "loss": 0.5149, | |
| "mean_token_accuracy": 0.8527609705924988, | |
| "num_tokens": 1811406.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 1.5516266524791718, | |
| "epoch": 0.7075606276747504, | |
| "grad_norm": 3.5, | |
| "learning_rate": 3.854481607432445e-06, | |
| "loss": 0.4476, | |
| "mean_token_accuracy": 0.8626842275261879, | |
| "num_tokens": 1816804.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.3300371170043945, | |
| "epoch": 0.7098430813124108, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 3.846556707978337e-06, | |
| "loss": 0.4001, | |
| "mean_token_accuracy": 0.8860765770077705, | |
| "num_tokens": 1823102.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 1.5138549208641052, | |
| "epoch": 0.7121255349500714, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 3.838612701556138e-06, | |
| "loss": 0.4696, | |
| "mean_token_accuracy": 0.8707823753356934, | |
| "num_tokens": 1828740.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 1.4780635386705399, | |
| "epoch": 0.7144079885877318, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 3.830649700887339e-06, | |
| "loss": 0.4598, | |
| "mean_token_accuracy": 0.8627598807215691, | |
| "num_tokens": 1835314.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 1.483512207865715, | |
| "epoch": 0.7166904422253922, | |
| "grad_norm": 3.859375, | |
| "learning_rate": 3.822667818962948e-06, | |
| "loss": 0.3944, | |
| "mean_token_accuracy": 0.8666610270738602, | |
| "num_tokens": 1840589.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 1.351017713546753, | |
| "epoch": 0.7189728958630528, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 3.814667169041887e-06, | |
| "loss": 0.4589, | |
| "mean_token_accuracy": 0.8681119009852409, | |
| "num_tokens": 1846865.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 1.454156056046486, | |
| "epoch": 0.7212553495007132, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 3.8066478646493898e-06, | |
| "loss": 0.3616, | |
| "mean_token_accuracy": 0.887380801141262, | |
| "num_tokens": 1853343.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 1.3507359623908997, | |
| "epoch": 0.7235378031383738, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 3.798610019575384e-06, | |
| "loss": 0.3908, | |
| "mean_token_accuracy": 0.8893059492111206, | |
| "num_tokens": 1859535.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 1.5166684240102768, | |
| "epoch": 0.7258202567760342, | |
| "grad_norm": 4.3125, | |
| "learning_rate": 3.790553747872885e-06, | |
| "loss": 0.5235, | |
| "mean_token_accuracy": 0.8411901965737343, | |
| "num_tokens": 1864957.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 1.4589732587337494, | |
| "epoch": 0.7281027104136947, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 3.7824791638563674e-06, | |
| "loss": 0.4074, | |
| "mean_token_accuracy": 0.8821713030338287, | |
| "num_tokens": 1870586.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 1.429191216826439, | |
| "epoch": 0.7303851640513552, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 3.7743863821001538e-06, | |
| "loss": 0.4902, | |
| "mean_token_accuracy": 0.8597285747528076, | |
| "num_tokens": 1876572.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.52955062687397, | |
| "epoch": 0.7326676176890157, | |
| "grad_norm": 4.09375, | |
| "learning_rate": 3.766275517436779e-06, | |
| "loss": 0.5007, | |
| "mean_token_accuracy": 0.8509823232889175, | |
| "num_tokens": 1881581.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 1.6073177456855774, | |
| "epoch": 0.7349500713266762, | |
| "grad_norm": 4.9375, | |
| "learning_rate": 3.7581466849553685e-06, | |
| "loss": 0.5742, | |
| "mean_token_accuracy": 0.8330699577927589, | |
| "num_tokens": 1886980.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 1.490510642528534, | |
| "epoch": 0.7372325249643367, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.5597, | |
| "mean_token_accuracy": 0.8421717286109924, | |
| "num_tokens": 1892848.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 1.4249206632375717, | |
| "epoch": 0.7395149786019971, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 3.741835578168071e-06, | |
| "loss": 0.5289, | |
| "mean_token_accuracy": 0.8406483083963394, | |
| "num_tokens": 1899057.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 1.4169852286577225, | |
| "epoch": 0.7417974322396577, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 3.7336535353086546e-06, | |
| "loss": 0.4855, | |
| "mean_token_accuracy": 0.8616788312792778, | |
| "num_tokens": 1905042.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 1.5189264565706253, | |
| "epoch": 0.7440798858773181, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 3.7254539875208577e-06, | |
| "loss": 0.5092, | |
| "mean_token_accuracy": 0.8563691675662994, | |
| "num_tokens": 1910608.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 1.4049191176891327, | |
| "epoch": 0.7463623395149787, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 3.717237051152175e-06, | |
| "loss": 0.4253, | |
| "mean_token_accuracy": 0.8755350038409233, | |
| "num_tokens": 1916900.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 1.4406355023384094, | |
| "epoch": 0.7486447931526391, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 3.7090028427968343e-06, | |
| "loss": 0.5454, | |
| "mean_token_accuracy": 0.8430257961153984, | |
| "num_tokens": 1923487.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 1.4147418439388275, | |
| "epoch": 0.7509272467902995, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 3.7007514792941462e-06, | |
| "loss": 0.4328, | |
| "mean_token_accuracy": 0.873896099627018, | |
| "num_tokens": 1929126.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 1.4407319128513336, | |
| "epoch": 0.7532097004279601, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 3.692483077726843e-06, | |
| "loss": 0.4482, | |
| "mean_token_accuracy": 0.8734828159213066, | |
| "num_tokens": 1935299.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.3825494647026062, | |
| "epoch": 0.7554921540656205, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 3.684197755419419e-06, | |
| "loss": 0.3914, | |
| "mean_token_accuracy": 0.8881862461566925, | |
| "num_tokens": 1941583.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 1.5679688155651093, | |
| "epoch": 0.757774607703281, | |
| "grad_norm": 3.5, | |
| "learning_rate": 3.6758956299364643e-06, | |
| "loss": 0.5205, | |
| "mean_token_accuracy": 0.850575216114521, | |
| "num_tokens": 1947719.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 1.478807806968689, | |
| "epoch": 0.7600570613409415, | |
| "grad_norm": 3.953125, | |
| "learning_rate": 3.6675768190810023e-06, | |
| "loss": 0.5383, | |
| "mean_token_accuracy": 0.8558880761265755, | |
| "num_tokens": 1952792.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 1.4567435085773468, | |
| "epoch": 0.762339514978602, | |
| "grad_norm": 3.625, | |
| "learning_rate": 3.659241440892806e-06, | |
| "loss": 0.4479, | |
| "mean_token_accuracy": 0.8747463598847389, | |
| "num_tokens": 1959114.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 1.3806256204843521, | |
| "epoch": 0.7646219686162625, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 3.6508896136467376e-06, | |
| "loss": 0.3259, | |
| "mean_token_accuracy": 0.9004263803362846, | |
| "num_tokens": 1965297.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 1.3886072635650635, | |
| "epoch": 0.766904422253923, | |
| "grad_norm": 2.75, | |
| "learning_rate": 3.642521455851058e-06, | |
| "loss": 0.3218, | |
| "mean_token_accuracy": 0.8972492516040802, | |
| "num_tokens": 1972145.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 1.4320484548807144, | |
| "epoch": 0.7691868758915834, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 3.634137086245754e-06, | |
| "loss": 0.4502, | |
| "mean_token_accuracy": 0.8562175408005714, | |
| "num_tokens": 1977851.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 1.606041207909584, | |
| "epoch": 0.771469329529244, | |
| "grad_norm": 3.921875, | |
| "learning_rate": 3.625736623800849e-06, | |
| "loss": 0.5698, | |
| "mean_token_accuracy": 0.8275244310498238, | |
| "num_tokens": 1983459.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 1.396391972899437, | |
| "epoch": 0.7737517831669044, | |
| "grad_norm": 3.125, | |
| "learning_rate": 3.6173201877147134e-06, | |
| "loss": 0.4157, | |
| "mean_token_accuracy": 0.8768060877919197, | |
| "num_tokens": 1989443.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 1.2999221831560135, | |
| "epoch": 0.776034236804565, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 3.6088878974123796e-06, | |
| "loss": 0.3211, | |
| "mean_token_accuracy": 0.9015626162290573, | |
| "num_tokens": 1996081.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.4438531249761581, | |
| "epoch": 0.7783166904422254, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 3.6004398725438406e-06, | |
| "loss": 0.4224, | |
| "mean_token_accuracy": 0.8693163841962814, | |
| "num_tokens": 2002046.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 1.5661528557538986, | |
| "epoch": 0.7805991440798858, | |
| "grad_norm": 3.8125, | |
| "learning_rate": 3.5919762329823556e-06, | |
| "loss": 0.4583, | |
| "mean_token_accuracy": 0.8407174274325371, | |
| "num_tokens": 2007992.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 1.5423270612955093, | |
| "epoch": 0.7828815977175464, | |
| "grad_norm": 3.828125, | |
| "learning_rate": 3.5834970988227484e-06, | |
| "loss": 0.5046, | |
| "mean_token_accuracy": 0.8615901097655296, | |
| "num_tokens": 2013678.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 1.3757345080375671, | |
| "epoch": 0.7851640513552068, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 3.5750025903797053e-06, | |
| "loss": 0.435, | |
| "mean_token_accuracy": 0.8637730702757835, | |
| "num_tokens": 2019976.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 1.5496114045381546, | |
| "epoch": 0.7874465049928673, | |
| "grad_norm": 4.03125, | |
| "learning_rate": 3.566492828186063e-06, | |
| "loss": 0.466, | |
| "mean_token_accuracy": 0.861820325255394, | |
| "num_tokens": 2025396.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 1.4001742899417877, | |
| "epoch": 0.7897289586305278, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 3.5579679329911025e-06, | |
| "loss": 0.4244, | |
| "mean_token_accuracy": 0.8774027079343796, | |
| "num_tokens": 2031341.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 1.4246700257062912, | |
| "epoch": 0.7920114122681883, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 3.5494280257588367e-06, | |
| "loss": 0.3573, | |
| "mean_token_accuracy": 0.8994497805833817, | |
| "num_tokens": 2038154.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 1.4771685898303986, | |
| "epoch": 0.7942938659058488, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 3.5408732276662882e-06, | |
| "loss": 0.4837, | |
| "mean_token_accuracy": 0.8569220453500748, | |
| "num_tokens": 2043977.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 1.3758689016103745, | |
| "epoch": 0.7965763195435093, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 3.532303660101776e-06, | |
| "loss": 0.4086, | |
| "mean_token_accuracy": 0.8799771890044212, | |
| "num_tokens": 2049581.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 1.4391580671072006, | |
| "epoch": 0.7988587731811697, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 3.5237194446631883e-06, | |
| "loss": 0.4414, | |
| "mean_token_accuracy": 0.8686051443219185, | |
| "num_tokens": 2054885.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.572434514760971, | |
| "epoch": 0.8011412268188303, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 3.515120703156264e-06, | |
| "loss": 0.4561, | |
| "mean_token_accuracy": 0.869783990085125, | |
| "num_tokens": 2060752.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 1.3927340656518936, | |
| "epoch": 0.8034236804564907, | |
| "grad_norm": 3.25, | |
| "learning_rate": 3.506507557592853e-06, | |
| "loss": 0.3986, | |
| "mean_token_accuracy": 0.8710938170552254, | |
| "num_tokens": 2066701.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 1.6066904217004776, | |
| "epoch": 0.8057061340941513, | |
| "grad_norm": 4.5, | |
| "learning_rate": 3.4978801301891972e-06, | |
| "loss": 0.5213, | |
| "mean_token_accuracy": 0.8417335525155067, | |
| "num_tokens": 2072037.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 1.5368521958589554, | |
| "epoch": 0.8079885877318117, | |
| "grad_norm": 3.6875, | |
| "learning_rate": 3.4892385433641875e-06, | |
| "loss": 0.5679, | |
| "mean_token_accuracy": 0.8372282758355141, | |
| "num_tokens": 2077090.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 1.4477348923683167, | |
| "epoch": 0.8102710413694721, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 3.480582919737631e-06, | |
| "loss": 0.4322, | |
| "mean_token_accuracy": 0.8827796950936317, | |
| "num_tokens": 2083157.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 1.449633464217186, | |
| "epoch": 0.8125534950071327, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 3.4719133821285108e-06, | |
| "loss": 0.497, | |
| "mean_token_accuracy": 0.8483736291527748, | |
| "num_tokens": 2089047.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 1.4000667333602905, | |
| "epoch": 0.8148359486447931, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 3.4632300535532415e-06, | |
| "loss": 0.5416, | |
| "mean_token_accuracy": 0.8374148234724998, | |
| "num_tokens": 2095911.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 1.5335423648357391, | |
| "epoch": 0.8171184022824537, | |
| "grad_norm": 3.703125, | |
| "learning_rate": 3.4545330572239234e-06, | |
| "loss": 0.4418, | |
| "mean_token_accuracy": 0.8705498203635216, | |
| "num_tokens": 2101062.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 1.4877882897853851, | |
| "epoch": 0.8194008559201141, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 3.445822516546598e-06, | |
| "loss": 0.382, | |
| "mean_token_accuracy": 0.885826900601387, | |
| "num_tokens": 2107503.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 1.5615941286087036, | |
| "epoch": 0.8216833095577746, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 3.437098555119493e-06, | |
| "loss": 0.4703, | |
| "mean_token_accuracy": 0.8597147017717361, | |
| "num_tokens": 2112957.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.4338414072990417, | |
| "epoch": 0.8239657631954351, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 3.4283612967312692e-06, | |
| "loss": 0.4431, | |
| "mean_token_accuracy": 0.8747149705886841, | |
| "num_tokens": 2119534.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 1.3991961032152176, | |
| "epoch": 0.8262482168330956, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 3.4196108653592662e-06, | |
| "loss": 0.3343, | |
| "mean_token_accuracy": 0.9073175340890884, | |
| "num_tokens": 2125905.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 1.4029065370559692, | |
| "epoch": 0.828530670470756, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 3.4108473851677408e-06, | |
| "loss": 0.3691, | |
| "mean_token_accuracy": 0.8828721046447754, | |
| "num_tokens": 2132517.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 1.4478721916675568, | |
| "epoch": 0.8308131241084166, | |
| "grad_norm": 3.0, | |
| "learning_rate": 3.4020709805061066e-06, | |
| "loss": 0.399, | |
| "mean_token_accuracy": 0.8760695457458496, | |
| "num_tokens": 2138908.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 1.470540538430214, | |
| "epoch": 0.833095577746077, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 3.3932817759071666e-06, | |
| "loss": 0.4839, | |
| "mean_token_accuracy": 0.8647991716861725, | |
| "num_tokens": 2144936.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 1.3821264803409576, | |
| "epoch": 0.8353780313837376, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 3.3844798960853533e-06, | |
| "loss": 0.4712, | |
| "mean_token_accuracy": 0.8681535720825195, | |
| "num_tokens": 2151022.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 1.4431174248456955, | |
| "epoch": 0.837660485021398, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 3.3756654659349487e-06, | |
| "loss": 0.4008, | |
| "mean_token_accuracy": 0.8728353902697563, | |
| "num_tokens": 2156626.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 1.3731088489294052, | |
| "epoch": 0.8399429386590584, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 3.3668386105283226e-06, | |
| "loss": 0.4741, | |
| "mean_token_accuracy": 0.863268293440342, | |
| "num_tokens": 2163234.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 1.4210239797830582, | |
| "epoch": 0.842225392296719, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 3.357999455114148e-06, | |
| "loss": 0.4039, | |
| "mean_token_accuracy": 0.8817742839455605, | |
| "num_tokens": 2169749.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 1.4794443249702454, | |
| "epoch": 0.8445078459343794, | |
| "grad_norm": 3.25, | |
| "learning_rate": 3.3491481251156355e-06, | |
| "loss": 0.4879, | |
| "mean_token_accuracy": 0.8580229580402374, | |
| "num_tokens": 2175776.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.6413906067609787, | |
| "epoch": 0.84679029957204, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 3.34028474612874e-06, | |
| "loss": 0.4411, | |
| "mean_token_accuracy": 0.8557733818888664, | |
| "num_tokens": 2180562.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 1.410418540239334, | |
| "epoch": 0.8490727532097004, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 3.3314094439203903e-06, | |
| "loss": 0.4152, | |
| "mean_token_accuracy": 0.8825007230043411, | |
| "num_tokens": 2185764.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 1.479749009013176, | |
| "epoch": 0.8513552068473609, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 3.322522344426698e-06, | |
| "loss": 0.4534, | |
| "mean_token_accuracy": 0.8688785433769226, | |
| "num_tokens": 2191225.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 1.4503730237483978, | |
| "epoch": 0.8536376604850214, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 3.3136235737511715e-06, | |
| "loss": 0.3714, | |
| "mean_token_accuracy": 0.8881650194525719, | |
| "num_tokens": 2196792.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 1.3789267241954803, | |
| "epoch": 0.8559201141226819, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 3.3047132581629297e-06, | |
| "loss": 0.398, | |
| "mean_token_accuracy": 0.8848712220788002, | |
| "num_tokens": 2203140.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 1.4894972145557404, | |
| "epoch": 0.8582025677603423, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 3.295791524094906e-06, | |
| "loss": 0.3865, | |
| "mean_token_accuracy": 0.8710450083017349, | |
| "num_tokens": 2209122.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 1.3985904306173325, | |
| "epoch": 0.8604850213980029, | |
| "grad_norm": 2.875, | |
| "learning_rate": 3.286858498142057e-06, | |
| "loss": 0.4158, | |
| "mean_token_accuracy": 0.878923624753952, | |
| "num_tokens": 2215258.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 1.548867017030716, | |
| "epoch": 0.8627674750356633, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 3.277914307059566e-06, | |
| "loss": 0.5408, | |
| "mean_token_accuracy": 0.8471002653241158, | |
| "num_tokens": 2221371.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 1.4772979021072388, | |
| "epoch": 0.8650499286733239, | |
| "grad_norm": 3.25, | |
| "learning_rate": 3.2689590777610443e-06, | |
| "loss": 0.3972, | |
| "mean_token_accuracy": 0.8763172924518585, | |
| "num_tokens": 2227158.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 1.5023012608289719, | |
| "epoch": 0.8673323823109843, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 3.259992937316727e-06, | |
| "loss": 0.4516, | |
| "mean_token_accuracy": 0.8623324111104012, | |
| "num_tokens": 2233629.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.5667530596256256, | |
| "epoch": 0.8696148359486447, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 3.251016012951678e-06, | |
| "loss": 0.6043, | |
| "mean_token_accuracy": 0.8312884569168091, | |
| "num_tokens": 2239082.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 1.380866751074791, | |
| "epoch": 0.8718972895863053, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 3.242028432043974e-06, | |
| "loss": 0.4196, | |
| "mean_token_accuracy": 0.8756621181964874, | |
| "num_tokens": 2245272.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 1.4950210005044937, | |
| "epoch": 0.8741797432239657, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 3.2330303221229078e-06, | |
| "loss": 0.4317, | |
| "mean_token_accuracy": 0.8579834923148155, | |
| "num_tokens": 2251010.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 1.7085559666156769, | |
| "epoch": 0.8764621968616263, | |
| "grad_norm": 4.6875, | |
| "learning_rate": 3.2240218108671683e-06, | |
| "loss": 0.6511, | |
| "mean_token_accuracy": 0.8028427958488464, | |
| "num_tokens": 2256288.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 1.579810380935669, | |
| "epoch": 0.8787446504992867, | |
| "grad_norm": 3.5, | |
| "learning_rate": 3.2150030261030414e-06, | |
| "loss": 0.4849, | |
| "mean_token_accuracy": 0.8453002646565437, | |
| "num_tokens": 2262186.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 1.5028070509433746, | |
| "epoch": 0.8810271041369472, | |
| "grad_norm": 3.6875, | |
| "learning_rate": 3.205974095802582e-06, | |
| "loss": 0.5576, | |
| "mean_token_accuracy": 0.8453918322920799, | |
| "num_tokens": 2268003.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 1.50083489716053, | |
| "epoch": 0.8833095577746077, | |
| "grad_norm": 3.859375, | |
| "learning_rate": 3.196935148081808e-06, | |
| "loss": 0.5821, | |
| "mean_token_accuracy": 0.8238921985030174, | |
| "num_tokens": 2273238.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 1.460751935839653, | |
| "epoch": 0.8855920114122682, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 3.187886311198881e-06, | |
| "loss": 0.463, | |
| "mean_token_accuracy": 0.8708171024918556, | |
| "num_tokens": 2279778.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 1.3422992527484894, | |
| "epoch": 0.8878744650499286, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 3.178827713552281e-06, | |
| "loss": 0.4008, | |
| "mean_token_accuracy": 0.875513955950737, | |
| "num_tokens": 2286016.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 1.5027628540992737, | |
| "epoch": 0.8901569186875892, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 3.1697594836789924e-06, | |
| "loss": 0.5086, | |
| "mean_token_accuracy": 0.8417061790823936, | |
| "num_tokens": 2291896.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.5571343451738358, | |
| "epoch": 0.8924393723252496, | |
| "grad_norm": 4.53125, | |
| "learning_rate": 3.160681750252674e-06, | |
| "loss": 0.5863, | |
| "mean_token_accuracy": 0.8346568569540977, | |
| "num_tokens": 2296989.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 1.4478174448013306, | |
| "epoch": 0.8947218259629102, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 3.1515946420818343e-06, | |
| "loss": 0.4618, | |
| "mean_token_accuracy": 0.8564577624201775, | |
| "num_tokens": 2303240.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 1.4417100101709366, | |
| "epoch": 0.8970042796005706, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 3.142498288108007e-06, | |
| "loss": 0.5086, | |
| "mean_token_accuracy": 0.8544816300272942, | |
| "num_tokens": 2308819.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 1.549110621213913, | |
| "epoch": 0.899286733238231, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 3.133392817403919e-06, | |
| "loss": 0.4943, | |
| "mean_token_accuracy": 0.8492691740393639, | |
| "num_tokens": 2315199.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 1.437395378947258, | |
| "epoch": 0.9015691868758916, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 3.124278359171657e-06, | |
| "loss": 0.4162, | |
| "mean_token_accuracy": 0.8790151923894882, | |
| "num_tokens": 2321449.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 1.4882071912288666, | |
| "epoch": 0.903851640513552, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 3.1151550427408383e-06, | |
| "loss": 0.3974, | |
| "mean_token_accuracy": 0.8646276146173477, | |
| "num_tokens": 2327198.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 1.414357990026474, | |
| "epoch": 0.9061340941512126, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 3.1060229975667716e-06, | |
| "loss": 0.3884, | |
| "mean_token_accuracy": 0.874775730073452, | |
| "num_tokens": 2333184.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 1.5017937868833542, | |
| "epoch": 0.908416547788873, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 3.0968823532286246e-06, | |
| "loss": 0.4596, | |
| "mean_token_accuracy": 0.8661977797746658, | |
| "num_tokens": 2339353.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 1.4912959188222885, | |
| "epoch": 0.9106990014265335, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 3.0877332394275806e-06, | |
| "loss": 0.3845, | |
| "mean_token_accuracy": 0.8872612118721008, | |
| "num_tokens": 2345323.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 1.5040694773197174, | |
| "epoch": 0.912981455064194, | |
| "grad_norm": 3.84375, | |
| "learning_rate": 3.0785757859850025e-06, | |
| "loss": 0.4793, | |
| "mean_token_accuracy": 0.8584380373358727, | |
| "num_tokens": 2350382.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.912981455064194, | |
| "eval_entropy": 1.4835859013928308, | |
| "eval_loss": 0.47563549876213074, | |
| "eval_mean_token_accuracy": 0.8651414997047848, | |
| "eval_num_tokens": 2350382.0, | |
| "eval_runtime": 4.4144, | |
| "eval_samples_per_second": 20.388, | |
| "eval_steps_per_second": 20.388, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.4684801995754242, | |
| "epoch": 0.9152639087018545, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 3.069410122840585e-06, | |
| "loss": 0.4838, | |
| "mean_token_accuracy": 0.8577789217233658, | |
| "num_tokens": 2356642.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 1.4736972451210022, | |
| "epoch": 0.917546362339515, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 3.0602363800505198e-06, | |
| "loss": 0.4626, | |
| "mean_token_accuracy": 0.8666577711701393, | |
| "num_tokens": 2363069.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 1.4170372486114502, | |
| "epoch": 0.9198288159771755, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 3.05105468778564e-06, | |
| "loss": 0.4183, | |
| "mean_token_accuracy": 0.8878279328346252, | |
| "num_tokens": 2369558.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 1.2785319834947586, | |
| "epoch": 0.9221112696148359, | |
| "grad_norm": 3.0, | |
| "learning_rate": 3.041865176329579e-06, | |
| "loss": 0.383, | |
| "mean_token_accuracy": 0.8874974772334099, | |
| "num_tokens": 2376487.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 1.5108132362365723, | |
| "epoch": 0.9243937232524965, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 3.032667976076923e-06, | |
| "loss": 0.5087, | |
| "mean_token_accuracy": 0.8496776968240738, | |
| "num_tokens": 2382047.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 1.4732455164194107, | |
| "epoch": 0.9266761768901569, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 3.0234632175313537e-06, | |
| "loss": 0.3808, | |
| "mean_token_accuracy": 0.8731858357787132, | |
| "num_tokens": 2388697.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 1.428204596042633, | |
| "epoch": 0.9289586305278174, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 3.0142510313038057e-06, | |
| "loss": 0.3893, | |
| "mean_token_accuracy": 0.8852085620164871, | |
| "num_tokens": 2395175.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 1.3948392271995544, | |
| "epoch": 0.9312410841654779, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 3.0050315481106074e-06, | |
| "loss": 0.4367, | |
| "mean_token_accuracy": 0.8680780380964279, | |
| "num_tokens": 2401107.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 1.4686945080757141, | |
| "epoch": 0.9335235378031383, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 2.9958048987716266e-06, | |
| "loss": 0.4492, | |
| "mean_token_accuracy": 0.8716259375214577, | |
| "num_tokens": 2407315.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 1.5125146508216858, | |
| "epoch": 0.9358059914407989, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 2.9865712142084145e-06, | |
| "loss": 0.5313, | |
| "mean_token_accuracy": 0.8568686470389366, | |
| "num_tokens": 2413259.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.433497592806816, | |
| "epoch": 0.9380884450784593, | |
| "grad_norm": 3.0, | |
| "learning_rate": 2.977330625442352e-06, | |
| "loss": 0.412, | |
| "mean_token_accuracy": 0.8721762746572495, | |
| "num_tokens": 2419468.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 1.4551435112953186, | |
| "epoch": 0.9403708987161198, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 2.9680832635927824e-06, | |
| "loss": 0.472, | |
| "mean_token_accuracy": 0.8528627678751945, | |
| "num_tokens": 2426271.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 1.447442203760147, | |
| "epoch": 0.9426533523537803, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 2.95882925987516e-06, | |
| "loss": 0.3598, | |
| "mean_token_accuracy": 0.8820754066109657, | |
| "num_tokens": 2432887.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 1.5209446549415588, | |
| "epoch": 0.9449358059914408, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 2.949568745599182e-06, | |
| "loss": 0.4893, | |
| "mean_token_accuracy": 0.8616260290145874, | |
| "num_tokens": 2438656.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 1.4069498479366302, | |
| "epoch": 0.9472182596291013, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 2.9403018521669256e-06, | |
| "loss": 0.5104, | |
| "mean_token_accuracy": 0.8574993088841438, | |
| "num_tokens": 2444704.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 1.487932413816452, | |
| "epoch": 0.9495007132667618, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 2.9310287110709895e-06, | |
| "loss": 0.4016, | |
| "mean_token_accuracy": 0.8731286600232124, | |
| "num_tokens": 2450361.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 1.5046747326850891, | |
| "epoch": 0.9517831669044222, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 2.921749453892618e-06, | |
| "loss": 0.4286, | |
| "mean_token_accuracy": 0.8756372630596161, | |
| "num_tokens": 2456532.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 1.5569333881139755, | |
| "epoch": 0.9540656205420828, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 2.9124642122998453e-06, | |
| "loss": 0.5047, | |
| "mean_token_accuracy": 0.8422510251402855, | |
| "num_tokens": 2462276.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 1.477220967411995, | |
| "epoch": 0.9563480741797432, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 2.903173118045616e-06, | |
| "loss": 0.4585, | |
| "mean_token_accuracy": 0.8631913363933563, | |
| "num_tokens": 2468621.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 1.3926943019032478, | |
| "epoch": 0.9586305278174037, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 2.893876302965925e-06, | |
| "loss": 0.4379, | |
| "mean_token_accuracy": 0.8661207035183907, | |
| "num_tokens": 2474234.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.5482182949781418, | |
| "epoch": 0.9609129814550642, | |
| "grad_norm": 3.78125, | |
| "learning_rate": 2.884573898977941e-06, | |
| "loss": 0.507, | |
| "mean_token_accuracy": 0.8496933579444885, | |
| "num_tokens": 2479680.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 1.360275536775589, | |
| "epoch": 0.9631954350927246, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 2.8752660380781367e-06, | |
| "loss": 0.4307, | |
| "mean_token_accuracy": 0.8788939565420151, | |
| "num_tokens": 2485907.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 1.3031716644763947, | |
| "epoch": 0.9654778887303852, | |
| "grad_norm": 2.875, | |
| "learning_rate": 2.865952852340417e-06, | |
| "loss": 0.3625, | |
| "mean_token_accuracy": 0.8956428542733192, | |
| "num_tokens": 2492467.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 1.541382610797882, | |
| "epoch": 0.9677603423680456, | |
| "grad_norm": 3.8125, | |
| "learning_rate": 2.856634473914242e-06, | |
| "loss": 0.5266, | |
| "mean_token_accuracy": 0.8559072092175484, | |
| "num_tokens": 2498045.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 1.4921831041574478, | |
| "epoch": 0.9700427960057061, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 2.8473110350227536e-06, | |
| "loss": 0.3466, | |
| "mean_token_accuracy": 0.8902567103505135, | |
| "num_tokens": 2503553.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 1.470309928059578, | |
| "epoch": 0.9723252496433666, | |
| "grad_norm": 3.375, | |
| "learning_rate": 2.8379826679609e-06, | |
| "loss": 0.4556, | |
| "mean_token_accuracy": 0.8601387813687325, | |
| "num_tokens": 2509707.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 1.3546678721904755, | |
| "epoch": 0.9746077032810271, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 2.828649505093558e-06, | |
| "loss": 0.3985, | |
| "mean_token_accuracy": 0.8941172435879707, | |
| "num_tokens": 2516288.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 1.4447802305221558, | |
| "epoch": 0.9768901569186876, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 2.819311678853652e-06, | |
| "loss": 0.4776, | |
| "mean_token_accuracy": 0.8569598346948624, | |
| "num_tokens": 2521956.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 1.6203635483980179, | |
| "epoch": 0.9791726105563481, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 2.8099693217402807e-06, | |
| "loss": 0.4593, | |
| "mean_token_accuracy": 0.8529090061783791, | |
| "num_tokens": 2526920.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 1.473097711801529, | |
| "epoch": 0.9814550641940085, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 2.800622566316831e-06, | |
| "loss": 0.5033, | |
| "mean_token_accuracy": 0.8560734689235687, | |
| "num_tokens": 2533504.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.5207239985466003, | |
| "epoch": 0.9837375178316691, | |
| "grad_norm": 4.09375, | |
| "learning_rate": 2.7912715452091014e-06, | |
| "loss": 0.5041, | |
| "mean_token_accuracy": 0.8554971441626549, | |
| "num_tokens": 2538535.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 1.5741059184074402, | |
| "epoch": 0.9860199714693295, | |
| "grad_norm": 4.0, | |
| "learning_rate": 2.7819163911034175e-06, | |
| "loss": 0.4511, | |
| "mean_token_accuracy": 0.8700136467814445, | |
| "num_tokens": 2543371.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 1.3865297734737396, | |
| "epoch": 0.9883024251069901, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 2.77255723674475e-06, | |
| "loss": 0.4648, | |
| "mean_token_accuracy": 0.8642655313014984, | |
| "num_tokens": 2549303.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 1.484322428703308, | |
| "epoch": 0.9905848787446505, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 2.7631942149348313e-06, | |
| "loss": 0.5178, | |
| "mean_token_accuracy": 0.8604016155004501, | |
| "num_tokens": 2554892.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 1.4711394906044006, | |
| "epoch": 0.992867332382311, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 2.7538274585302707e-06, | |
| "loss": 0.5105, | |
| "mean_token_accuracy": 0.8574899211525917, | |
| "num_tokens": 2561168.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 1.4003391563892365, | |
| "epoch": 0.9951497860199715, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 2.74445710044067e-06, | |
| "loss": 0.3995, | |
| "mean_token_accuracy": 0.8786035105586052, | |
| "num_tokens": 2567401.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 1.4778650850057602, | |
| "epoch": 0.997432239657632, | |
| "grad_norm": 3.25, | |
| "learning_rate": 2.735083273626738e-06, | |
| "loss": 0.5094, | |
| "mean_token_accuracy": 0.8610806316137314, | |
| "num_tokens": 2573896.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 1.5298404842615128, | |
| "epoch": 0.9997146932952924, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 2.7257061110984005e-06, | |
| "loss": 0.5801, | |
| "mean_token_accuracy": 0.8354984298348427, | |
| "num_tokens": 2579575.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 1.2647957801818848, | |
| "epoch": 1.0, | |
| "grad_norm": 7.5, | |
| "learning_rate": 2.7163257459129184e-06, | |
| "loss": 0.3378, | |
| "mean_token_accuracy": 0.9111570119857788, | |
| "num_tokens": 2580462.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 1.5493428707122803, | |
| "epoch": 1.0022824536376604, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 2.7069423111729948e-06, | |
| "loss": 0.482, | |
| "mean_token_accuracy": 0.8536929711699486, | |
| "num_tokens": 2586104.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.6429398506879807, | |
| "epoch": 1.0045649072753209, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 2.6975559400248876e-06, | |
| "loss": 0.5162, | |
| "mean_token_accuracy": 0.8646445199847221, | |
| "num_tokens": 2591601.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 1.3536241203546524, | |
| "epoch": 1.0068473609129815, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 2.688166765656523e-06, | |
| "loss": 0.3578, | |
| "mean_token_accuracy": 0.8843531683087349, | |
| "num_tokens": 2598127.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 1.4669701904058456, | |
| "epoch": 1.009129814550642, | |
| "grad_norm": 3.921875, | |
| "learning_rate": 2.6787749212956023e-06, | |
| "loss": 0.5313, | |
| "mean_token_accuracy": 0.8472650721669197, | |
| "num_tokens": 2603447.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 1.4554204195737839, | |
| "epoch": 1.0114122681883024, | |
| "grad_norm": 3.78125, | |
| "learning_rate": 2.6693805402077123e-06, | |
| "loss": 0.5817, | |
| "mean_token_accuracy": 0.83076561242342, | |
| "num_tokens": 2609040.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 1.4986287206411362, | |
| "epoch": 1.0136947218259629, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 2.6599837556944353e-06, | |
| "loss": 0.498, | |
| "mean_token_accuracy": 0.8590250089764595, | |
| "num_tokens": 2615545.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 1.5251432359218597, | |
| "epoch": 1.0159771754636233, | |
| "grad_norm": 4.0, | |
| "learning_rate": 2.6505847010914575e-06, | |
| "loss": 0.633, | |
| "mean_token_accuracy": 0.8183507323265076, | |
| "num_tokens": 2621930.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 1.4970913529396057, | |
| "epoch": 1.018259629101284, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 2.641183509766675e-06, | |
| "loss": 0.3988, | |
| "mean_token_accuracy": 0.8723035603761673, | |
| "num_tokens": 2627761.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 1.4567296206951141, | |
| "epoch": 1.0205420827389444, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 2.6317803151183053e-06, | |
| "loss": 0.4201, | |
| "mean_token_accuracy": 0.8818748518824577, | |
| "num_tokens": 2633748.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 1.4635232239961624, | |
| "epoch": 1.0228245363766049, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 2.6223752505729884e-06, | |
| "loss": 0.452, | |
| "mean_token_accuracy": 0.8645489439368248, | |
| "num_tokens": 2639662.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 1.4294497519731522, | |
| "epoch": 1.0251069900142653, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 2.6129684495839013e-06, | |
| "loss": 0.5102, | |
| "mean_token_accuracy": 0.8570954278111458, | |
| "num_tokens": 2645946.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.3900626301765442, | |
| "epoch": 1.0273894436519257, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 2.6035600456288573e-06, | |
| "loss": 0.3859, | |
| "mean_token_accuracy": 0.8834785372018814, | |
| "num_tokens": 2652364.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 1.4409504532814026, | |
| "epoch": 1.0296718972895864, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 2.594150172208417e-06, | |
| "loss": 0.4641, | |
| "mean_token_accuracy": 0.8652448132634163, | |
| "num_tokens": 2658338.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 1.5055885165929794, | |
| "epoch": 1.0319543509272469, | |
| "grad_norm": 3.625, | |
| "learning_rate": 2.5847389628439905e-06, | |
| "loss": 0.426, | |
| "mean_token_accuracy": 0.8645097240805626, | |
| "num_tokens": 2663620.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 1.5077017843723297, | |
| "epoch": 1.0342368045649073, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 2.575326551075945e-06, | |
| "loss": 0.4288, | |
| "mean_token_accuracy": 0.8733096942305565, | |
| "num_tokens": 2669362.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 1.3824554234743118, | |
| "epoch": 1.0365192582025677, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 2.5659130704617092e-06, | |
| "loss": 0.4209, | |
| "mean_token_accuracy": 0.8664216324687004, | |
| "num_tokens": 2675587.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 1.4790180027484894, | |
| "epoch": 1.0388017118402282, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 2.5564986545738767e-06, | |
| "loss": 0.3928, | |
| "mean_token_accuracy": 0.8827410265803337, | |
| "num_tokens": 2681742.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 1.4870340526103973, | |
| "epoch": 1.0410841654778886, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 2.547083436998316e-06, | |
| "loss": 0.3968, | |
| "mean_token_accuracy": 0.8777871504426003, | |
| "num_tokens": 2687070.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 1.492873653769493, | |
| "epoch": 1.0433666191155493, | |
| "grad_norm": 3.375, | |
| "learning_rate": 2.5376675513322665e-06, | |
| "loss": 0.4273, | |
| "mean_token_accuracy": 0.8743336573243141, | |
| "num_tokens": 2693415.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 1.5607992857694626, | |
| "epoch": 1.0456490727532097, | |
| "grad_norm": 4.0, | |
| "learning_rate": 2.52825113118245e-06, | |
| "loss": 0.5436, | |
| "mean_token_accuracy": 0.8444447070360184, | |
| "num_tokens": 2699241.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 1.4991340637207031, | |
| "epoch": 1.0479315263908702, | |
| "grad_norm": 3.0, | |
| "learning_rate": 2.5188343101631717e-06, | |
| "loss": 0.4713, | |
| "mean_token_accuracy": 0.8594570085406303, | |
| "num_tokens": 2705629.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.4429044276475906, | |
| "epoch": 1.0502139800285306, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 2.5094172218944276e-06, | |
| "loss": 0.5136, | |
| "mean_token_accuracy": 0.8507946282625198, | |
| "num_tokens": 2711944.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 1.5478469878435135, | |
| "epoch": 1.052496433666191, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.4498, | |
| "mean_token_accuracy": 0.8698392882943153, | |
| "num_tokens": 2717870.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 1.4724483042955399, | |
| "epoch": 1.0547788873038517, | |
| "grad_norm": 4.09375, | |
| "learning_rate": 2.4905827781055733e-06, | |
| "loss": 0.5091, | |
| "mean_token_accuracy": 0.8364823833107948, | |
| "num_tokens": 2722955.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 1.4399842321872711, | |
| "epoch": 1.0570613409415122, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 2.4811656898368287e-06, | |
| "loss": 0.4118, | |
| "mean_token_accuracy": 0.8793508112430573, | |
| "num_tokens": 2729267.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 1.4447701424360275, | |
| "epoch": 1.0593437945791726, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 2.4717488688175513e-06, | |
| "loss": 0.4089, | |
| "mean_token_accuracy": 0.8816163316369057, | |
| "num_tokens": 2735200.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 1.507298544049263, | |
| "epoch": 1.061626248216833, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 2.4623324486677352e-06, | |
| "loss": 0.5426, | |
| "mean_token_accuracy": 0.8359150066971779, | |
| "num_tokens": 2740627.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 1.4749993681907654, | |
| "epoch": 1.0639087018544935, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 2.4529165630016855e-06, | |
| "loss": 0.4186, | |
| "mean_token_accuracy": 0.8762158378958702, | |
| "num_tokens": 2745817.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 1.5043630599975586, | |
| "epoch": 1.0661911554921542, | |
| "grad_norm": 3.25, | |
| "learning_rate": 2.4435013454261246e-06, | |
| "loss": 0.4691, | |
| "mean_token_accuracy": 0.8595764860510826, | |
| "num_tokens": 2752047.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 1.464219182729721, | |
| "epoch": 1.0684736091298146, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 2.4340869295382924e-06, | |
| "loss": 0.4847, | |
| "mean_token_accuracy": 0.8647123128175735, | |
| "num_tokens": 2758030.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 1.5525110363960266, | |
| "epoch": 1.070756062767475, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 2.4246734489240554e-06, | |
| "loss": 0.4389, | |
| "mean_token_accuracy": 0.871659129858017, | |
| "num_tokens": 2763739.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.441315084695816, | |
| "epoch": 1.0730385164051355, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 2.4152610371560095e-06, | |
| "loss": 0.4706, | |
| "mean_token_accuracy": 0.8659368455410004, | |
| "num_tokens": 2770144.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 1.5431715548038483, | |
| "epoch": 1.075320970042796, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 2.4058498277915835e-06, | |
| "loss": 0.5396, | |
| "mean_token_accuracy": 0.8234963491559029, | |
| "num_tokens": 2776060.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 1.3775285333395004, | |
| "epoch": 1.0776034236804566, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 2.3964399543711427e-06, | |
| "loss": 0.3289, | |
| "mean_token_accuracy": 0.8977130725979805, | |
| "num_tokens": 2782100.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 1.424841582775116, | |
| "epoch": 1.079885877318117, | |
| "grad_norm": 3.125, | |
| "learning_rate": 2.3870315504160995e-06, | |
| "loss": 0.4425, | |
| "mean_token_accuracy": 0.8671782091259956, | |
| "num_tokens": 2787965.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 1.4423463493585587, | |
| "epoch": 1.0821683309557775, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 2.377624749427012e-06, | |
| "loss": 0.3595, | |
| "mean_token_accuracy": 0.8889539316296577, | |
| "num_tokens": 2794165.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 1.4992396533489227, | |
| "epoch": 1.084450784593438, | |
| "grad_norm": 3.875, | |
| "learning_rate": 2.3682196848816955e-06, | |
| "loss": 0.4793, | |
| "mean_token_accuracy": 0.8694660887122154, | |
| "num_tokens": 2800010.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 1.4096488505601883, | |
| "epoch": 1.0867332382310984, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 2.358816490233326e-06, | |
| "loss": 0.3516, | |
| "mean_token_accuracy": 0.8974229022860527, | |
| "num_tokens": 2805889.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 1.4805195033550262, | |
| "epoch": 1.089015691868759, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 2.3494152989085433e-06, | |
| "loss": 0.5061, | |
| "mean_token_accuracy": 0.8679251745343208, | |
| "num_tokens": 2811684.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 1.5036189705133438, | |
| "epoch": 1.0912981455064195, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 2.3400162443055655e-06, | |
| "loss": 0.5221, | |
| "mean_token_accuracy": 0.8420342952013016, | |
| "num_tokens": 2817131.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 1.594360738992691, | |
| "epoch": 1.09358059914408, | |
| "grad_norm": 4.0, | |
| "learning_rate": 2.330619459792289e-06, | |
| "loss": 0.5052, | |
| "mean_token_accuracy": 0.8538608327507973, | |
| "num_tokens": 2822205.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.3911210894584656, | |
| "epoch": 1.0958630527817403, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 2.321225078704399e-06, | |
| "loss": 0.3525, | |
| "mean_token_accuracy": 0.8852925226092339, | |
| "num_tokens": 2828146.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 1.5996953547000885, | |
| "epoch": 1.0981455064194008, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 2.311833234343478e-06, | |
| "loss": 0.4677, | |
| "mean_token_accuracy": 0.8572832494974136, | |
| "num_tokens": 2833879.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 1.5117892771959305, | |
| "epoch": 1.1004279600570612, | |
| "grad_norm": 4.09375, | |
| "learning_rate": 2.3024440599751132e-06, | |
| "loss": 0.4467, | |
| "mean_token_accuracy": 0.8582476228475571, | |
| "num_tokens": 2839173.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 1.433998242020607, | |
| "epoch": 1.102710413694722, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 2.293057688827007e-06, | |
| "loss": 0.3942, | |
| "mean_token_accuracy": 0.8847835510969162, | |
| "num_tokens": 2845616.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 1.5342581421136856, | |
| "epoch": 1.1049928673323823, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 2.283674254087082e-06, | |
| "loss": 0.4659, | |
| "mean_token_accuracy": 0.8615615218877792, | |
| "num_tokens": 2851949.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 1.5389353781938553, | |
| "epoch": 1.1072753209700428, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 2.274293888901599e-06, | |
| "loss": 0.4388, | |
| "mean_token_accuracy": 0.871217891573906, | |
| "num_tokens": 2857358.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 1.4772920906543732, | |
| "epoch": 1.1095577746077032, | |
| "grad_norm": 4.03125, | |
| "learning_rate": 2.264916726373263e-06, | |
| "loss": 0.5044, | |
| "mean_token_accuracy": 0.8598240464925766, | |
| "num_tokens": 2862299.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 1.4805989265441895, | |
| "epoch": 1.1118402282453639, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 2.2555428995593303e-06, | |
| "loss": 0.444, | |
| "mean_token_accuracy": 0.8689677938818932, | |
| "num_tokens": 2868820.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 1.4840258061885834, | |
| "epoch": 1.1141226818830243, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 2.24617254146973e-06, | |
| "loss": 0.4531, | |
| "mean_token_accuracy": 0.8679408878087997, | |
| "num_tokens": 2874968.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 1.4381522238254547, | |
| "epoch": 1.1164051355206848, | |
| "grad_norm": 3.125, | |
| "learning_rate": 2.23680578506517e-06, | |
| "loss": 0.4115, | |
| "mean_token_accuracy": 0.8769493475556374, | |
| "num_tokens": 2880835.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.4330200850963593, | |
| "epoch": 1.1186875891583452, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 2.2274427632552507e-06, | |
| "loss": 0.4123, | |
| "mean_token_accuracy": 0.8793010637164116, | |
| "num_tokens": 2887529.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 1.3696521073579788, | |
| "epoch": 1.1209700427960057, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 2.2180836088965833e-06, | |
| "loss": 0.3384, | |
| "mean_token_accuracy": 0.8860399350523949, | |
| "num_tokens": 2893458.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 1.4893521070480347, | |
| "epoch": 1.123252496433666, | |
| "grad_norm": 3.0, | |
| "learning_rate": 2.208728454790899e-06, | |
| "loss": 0.4691, | |
| "mean_token_accuracy": 0.8572286292910576, | |
| "num_tokens": 2899716.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 1.3807679414749146, | |
| "epoch": 1.1255349500713268, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 2.1993774336831696e-06, | |
| "loss": 0.4068, | |
| "mean_token_accuracy": 0.8788377121090889, | |
| "num_tokens": 2906271.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 1.4945531785488129, | |
| "epoch": 1.1278174037089872, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 2.19003067825972e-06, | |
| "loss": 0.4081, | |
| "mean_token_accuracy": 0.8731363192200661, | |
| "num_tokens": 2912348.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 1.5495448559522629, | |
| "epoch": 1.1300998573466476, | |
| "grad_norm": 3.921875, | |
| "learning_rate": 2.180688321146349e-06, | |
| "loss": 0.601, | |
| "mean_token_accuracy": 0.8166243210434914, | |
| "num_tokens": 2918060.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 1.5690300911664963, | |
| "epoch": 1.132382310984308, | |
| "grad_norm": 3.5, | |
| "learning_rate": 2.1713504949064433e-06, | |
| "loss": 0.4601, | |
| "mean_token_accuracy": 0.85266974568367, | |
| "num_tokens": 2923409.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 1.3820966184139252, | |
| "epoch": 1.1346647646219685, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 2.1620173320391007e-06, | |
| "loss": 0.2558, | |
| "mean_token_accuracy": 0.9106499254703522, | |
| "num_tokens": 2929722.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 1.540186420083046, | |
| "epoch": 1.1369472182596292, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 2.1526889649772477e-06, | |
| "loss": 0.4437, | |
| "mean_token_accuracy": 0.8645635023713112, | |
| "num_tokens": 2935812.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 1.435683935880661, | |
| "epoch": 1.1392296718972896, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 2.143365526085759e-06, | |
| "loss": 0.48, | |
| "mean_token_accuracy": 0.8664367198944092, | |
| "num_tokens": 2942222.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.1392296718972896, | |
| "eval_entropy": 1.4798295431666904, | |
| "eval_loss": 0.4741344451904297, | |
| "eval_mean_token_accuracy": 0.8666040844387478, | |
| "eval_num_tokens": 2942222.0, | |
| "eval_runtime": 4.4417, | |
| "eval_samples_per_second": 20.262, | |
| "eval_steps_per_second": 20.262, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.4722786843776703, | |
| "epoch": 1.14151212553495, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 2.1340471476595836e-06, | |
| "loss": 0.4947, | |
| "mean_token_accuracy": 0.8604092225432396, | |
| "num_tokens": 2947869.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 1.5302625745534897, | |
| "epoch": 1.1437945791726105, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 2.124733961921864e-06, | |
| "loss": 0.5213, | |
| "mean_token_accuracy": 0.8443537354469299, | |
| "num_tokens": 2953787.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 1.482778623700142, | |
| "epoch": 1.146077032810271, | |
| "grad_norm": 4.0, | |
| "learning_rate": 2.11542610102206e-06, | |
| "loss": 0.5494, | |
| "mean_token_accuracy": 0.8402880057692528, | |
| "num_tokens": 2958803.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 1.4486149698495865, | |
| "epoch": 1.1483594864479316, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 2.1061236970340756e-06, | |
| "loss": 0.4747, | |
| "mean_token_accuracy": 0.8640668168663979, | |
| "num_tokens": 2965403.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 1.4366931170225143, | |
| "epoch": 1.150641940085592, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 2.096826881954385e-06, | |
| "loss": 0.4002, | |
| "mean_token_accuracy": 0.869108684360981, | |
| "num_tokens": 2971085.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 1.4204550981521606, | |
| "epoch": 1.1529243937232525, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 2.0875357877001556e-06, | |
| "loss": 0.3827, | |
| "mean_token_accuracy": 0.8868636935949326, | |
| "num_tokens": 2976577.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 1.484930396080017, | |
| "epoch": 1.155206847360913, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 2.0782505461073822e-06, | |
| "loss": 0.4416, | |
| "mean_token_accuracy": 0.8644617721438408, | |
| "num_tokens": 2981977.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 1.5487978011369705, | |
| "epoch": 1.1574893009985734, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 2.0689712889290114e-06, | |
| "loss": 0.4142, | |
| "mean_token_accuracy": 0.8582484424114227, | |
| "num_tokens": 2987315.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 1.4167566150426865, | |
| "epoch": 1.159771754636234, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 2.059698147833075e-06, | |
| "loss": 0.4121, | |
| "mean_token_accuracy": 0.8841976970434189, | |
| "num_tokens": 2993295.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 1.3966283351182938, | |
| "epoch": 1.1620542082738945, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 2.0504312544008193e-06, | |
| "loss": 0.4939, | |
| "mean_token_accuracy": 0.8544362857937813, | |
| "num_tokens": 2999720.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.5113223046064377, | |
| "epoch": 1.164336661911555, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 2.0411707401248406e-06, | |
| "loss": 0.4498, | |
| "mean_token_accuracy": 0.8582001850008965, | |
| "num_tokens": 3004838.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 1.4698415398597717, | |
| "epoch": 1.1666191155492154, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 2.0319167364072184e-06, | |
| "loss": 0.4023, | |
| "mean_token_accuracy": 0.8724709004163742, | |
| "num_tokens": 3010321.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 1.5211911350488663, | |
| "epoch": 1.1689015691868758, | |
| "grad_norm": 4.71875, | |
| "learning_rate": 2.0226693745576494e-06, | |
| "loss": 0.5156, | |
| "mean_token_accuracy": 0.8473959043622017, | |
| "num_tokens": 3015170.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 1.3687680065631866, | |
| "epoch": 1.1711840228245363, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 2.0134287857915864e-06, | |
| "loss": 0.4614, | |
| "mean_token_accuracy": 0.8563283011317253, | |
| "num_tokens": 3021067.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 1.4768206179141998, | |
| "epoch": 1.173466476462197, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 2.004195101228374e-06, | |
| "loss": 0.5225, | |
| "mean_token_accuracy": 0.8456647023558617, | |
| "num_tokens": 3026317.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 1.5133604854345322, | |
| "epoch": 1.1757489300998574, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 1.9949684518893926e-06, | |
| "loss": 0.4637, | |
| "mean_token_accuracy": 0.8587842807173729, | |
| "num_tokens": 3032462.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 1.5985838025808334, | |
| "epoch": 1.1780313837375178, | |
| "grad_norm": 3.5, | |
| "learning_rate": 1.985748968696194e-06, | |
| "loss": 0.4668, | |
| "mean_token_accuracy": 0.8562392815947533, | |
| "num_tokens": 3037823.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 1.2920548766851425, | |
| "epoch": 1.1803138373751783, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.9765367824686467e-06, | |
| "loss": 0.3451, | |
| "mean_token_accuracy": 0.8893763497471809, | |
| "num_tokens": 3044938.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 1.5204766243696213, | |
| "epoch": 1.182596291012839, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 1.9673320239230783e-06, | |
| "loss": 0.4753, | |
| "mean_token_accuracy": 0.8598108664155006, | |
| "num_tokens": 3051301.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 1.445823684334755, | |
| "epoch": 1.1848787446504994, | |
| "grad_norm": 3.875, | |
| "learning_rate": 1.9581348236704217e-06, | |
| "loss": 0.4797, | |
| "mean_token_accuracy": 0.8649851009249687, | |
| "num_tokens": 3056991.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.4846927672624588, | |
| "epoch": 1.1871611982881598, | |
| "grad_norm": 3.875, | |
| "learning_rate": 1.9489453122143605e-06, | |
| "loss": 0.5029, | |
| "mean_token_accuracy": 0.8675966411828995, | |
| "num_tokens": 3062974.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 1.4466316848993301, | |
| "epoch": 1.1894436519258202, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 1.939763619949481e-06, | |
| "loss": 0.4049, | |
| "mean_token_accuracy": 0.8771371468901634, | |
| "num_tokens": 3068426.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 1.5384458899497986, | |
| "epoch": 1.1917261055634807, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.930589877159415e-06, | |
| "loss": 0.454, | |
| "mean_token_accuracy": 0.864221066236496, | |
| "num_tokens": 3074213.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 1.541999727487564, | |
| "epoch": 1.1940085592011411, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 1.9214242140149987e-06, | |
| "loss": 0.3965, | |
| "mean_token_accuracy": 0.874009445309639, | |
| "num_tokens": 3080429.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 1.4880231320858002, | |
| "epoch": 1.1962910128388018, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 1.9122667605724202e-06, | |
| "loss": 0.5623, | |
| "mean_token_accuracy": 0.8356714621186256, | |
| "num_tokens": 3085713.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 1.5347374975681305, | |
| "epoch": 1.1985734664764622, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1.9031176467713763e-06, | |
| "loss": 0.3592, | |
| "mean_token_accuracy": 0.8790554702281952, | |
| "num_tokens": 3092191.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 1.4829518347978592, | |
| "epoch": 1.2008559201141227, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 1.8939770024332294e-06, | |
| "loss": 0.3886, | |
| "mean_token_accuracy": 0.882826641201973, | |
| "num_tokens": 3098756.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 1.4315824955701828, | |
| "epoch": 1.2031383737517831, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 1.884844957259163e-06, | |
| "loss": 0.4995, | |
| "mean_token_accuracy": 0.8524395078420639, | |
| "num_tokens": 3104965.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 1.4298695474863052, | |
| "epoch": 1.2054208273894436, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1.875721640828344e-06, | |
| "loss": 0.3871, | |
| "mean_token_accuracy": 0.8858682960271835, | |
| "num_tokens": 3111490.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 1.4648047238588333, | |
| "epoch": 1.2077032810271042, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 1.866607182596081e-06, | |
| "loss": 0.3277, | |
| "mean_token_accuracy": 0.8968348726630211, | |
| "num_tokens": 3117215.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.5635619461536407, | |
| "epoch": 1.2099857346647647, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.857501711891993e-06, | |
| "loss": 0.4185, | |
| "mean_token_accuracy": 0.8711593821644783, | |
| "num_tokens": 3123093.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 1.4186049550771713, | |
| "epoch": 1.212268188302425, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.848405357918166e-06, | |
| "loss": 0.4707, | |
| "mean_token_accuracy": 0.8640479817986488, | |
| "num_tokens": 3129377.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 1.439442053437233, | |
| "epoch": 1.2145506419400856, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 1.8393182497473271e-06, | |
| "loss": 0.3726, | |
| "mean_token_accuracy": 0.8774393498897552, | |
| "num_tokens": 3135006.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 1.5040159970521927, | |
| "epoch": 1.216833095577746, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 1.830240516321008e-06, | |
| "loss": 0.5652, | |
| "mean_token_accuracy": 0.8349686115980148, | |
| "num_tokens": 3140699.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 1.5229474604129791, | |
| "epoch": 1.2191155492154067, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.8211722864477197e-06, | |
| "loss": 0.4583, | |
| "mean_token_accuracy": 0.8692138940095901, | |
| "num_tokens": 3147116.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 1.4866357445716858, | |
| "epoch": 1.221398002853067, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 1.8121136888011198e-06, | |
| "loss": 0.5026, | |
| "mean_token_accuracy": 0.8499261438846588, | |
| "num_tokens": 3153155.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 1.4372419267892838, | |
| "epoch": 1.2236804564907275, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 1.8030648519181926e-06, | |
| "loss": 0.4709, | |
| "mean_token_accuracy": 0.8507603630423546, | |
| "num_tokens": 3158699.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 1.4247355163097382, | |
| "epoch": 1.225962910128388, | |
| "grad_norm": 2.875, | |
| "learning_rate": 1.7940259041974189e-06, | |
| "loss": 0.4764, | |
| "mean_token_accuracy": 0.8748277649283409, | |
| "num_tokens": 3165422.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 1.4786742329597473, | |
| "epoch": 1.2282453637660484, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 1.7849969738969592e-06, | |
| "loss": 0.4736, | |
| "mean_token_accuracy": 0.8629911243915558, | |
| "num_tokens": 3171419.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 1.5304382294416428, | |
| "epoch": 1.230527817403709, | |
| "grad_norm": 3.375, | |
| "learning_rate": 1.7759781891328321e-06, | |
| "loss": 0.494, | |
| "mean_token_accuracy": 0.8473329395055771, | |
| "num_tokens": 3177530.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.3744118362665176, | |
| "epoch": 1.2328102710413695, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1.766969677877094e-06, | |
| "loss": 0.4123, | |
| "mean_token_accuracy": 0.8834565728902817, | |
| "num_tokens": 3184220.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 1.3755813837051392, | |
| "epoch": 1.23509272467903, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 1.7579715679560273e-06, | |
| "loss": 0.4265, | |
| "mean_token_accuracy": 0.8768275752663612, | |
| "num_tokens": 3190613.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 1.4333829581737518, | |
| "epoch": 1.2373751783166904, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.7489839870483236e-06, | |
| "loss": 0.4931, | |
| "mean_token_accuracy": 0.8496510609984398, | |
| "num_tokens": 3196269.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 1.4510899037122726, | |
| "epoch": 1.2396576319543509, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 1.7400070626832732e-06, | |
| "loss": 0.3757, | |
| "mean_token_accuracy": 0.8865254819393158, | |
| "num_tokens": 3201924.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 1.4932819455862045, | |
| "epoch": 1.2419400855920113, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.7310409222389563e-06, | |
| "loss": 0.4531, | |
| "mean_token_accuracy": 0.850062184035778, | |
| "num_tokens": 3207808.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 1.5299255549907684, | |
| "epoch": 1.244222539229672, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 1.7220856929404342e-06, | |
| "loss": 0.4531, | |
| "mean_token_accuracy": 0.8687416762113571, | |
| "num_tokens": 3213083.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 1.5315914154052734, | |
| "epoch": 1.2465049928673324, | |
| "grad_norm": 3.84375, | |
| "learning_rate": 1.713141501857943e-06, | |
| "loss": 0.504, | |
| "mean_token_accuracy": 0.850853443145752, | |
| "num_tokens": 3218803.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 1.5325356125831604, | |
| "epoch": 1.2487874465049928, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 1.7042084759050948e-06, | |
| "loss": 0.495, | |
| "mean_token_accuracy": 0.8577945232391357, | |
| "num_tokens": 3224187.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 1.3780454993247986, | |
| "epoch": 1.2510699001426533, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 1.6952867418370707e-06, | |
| "loss": 0.4453, | |
| "mean_token_accuracy": 0.8700388446450233, | |
| "num_tokens": 3230589.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 1.466676115989685, | |
| "epoch": 1.253352353780314, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1.6863764262488292e-06, | |
| "loss": 0.496, | |
| "mean_token_accuracy": 0.8478997200727463, | |
| "num_tokens": 3237256.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.4295217841863632, | |
| "epoch": 1.2556348074179744, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.677477655573303e-06, | |
| "loss": 0.4455, | |
| "mean_token_accuracy": 0.8676532134413719, | |
| "num_tokens": 3243578.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 1.4432758837938309, | |
| "epoch": 1.2579172610556348, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.6685905560796101e-06, | |
| "loss": 0.4933, | |
| "mean_token_accuracy": 0.8503763899207115, | |
| "num_tokens": 3249344.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 1.4768379628658295, | |
| "epoch": 1.2601997146932953, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 1.6597152538712608e-06, | |
| "loss": 0.5331, | |
| "mean_token_accuracy": 0.8477922007441521, | |
| "num_tokens": 3256038.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 1.4157912582159042, | |
| "epoch": 1.2624821683309557, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 1.6508518748843651e-06, | |
| "loss": 0.5013, | |
| "mean_token_accuracy": 0.860062412917614, | |
| "num_tokens": 3261703.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 1.403880551457405, | |
| "epoch": 1.2647646219686162, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 1.6420005448858522e-06, | |
| "loss": 0.5094, | |
| "mean_token_accuracy": 0.8528245538473129, | |
| "num_tokens": 3268063.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 1.5064998269081116, | |
| "epoch": 1.2670470756062768, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 1.6331613894716787e-06, | |
| "loss": 0.4452, | |
| "mean_token_accuracy": 0.8757540956139565, | |
| "num_tokens": 3274092.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 1.4100589752197266, | |
| "epoch": 1.2693295292439373, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 1.6243345340650523e-06, | |
| "loss": 0.4675, | |
| "mean_token_accuracy": 0.8688594177365303, | |
| "num_tokens": 3280661.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 1.5577640682458878, | |
| "epoch": 1.2716119828815977, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 1.6155201039146478e-06, | |
| "loss": 0.4195, | |
| "mean_token_accuracy": 0.8589218854904175, | |
| "num_tokens": 3286601.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 1.3485192209482193, | |
| "epoch": 1.2738944365192582, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1.6067182240928332e-06, | |
| "loss": 0.3449, | |
| "mean_token_accuracy": 0.8934107944369316, | |
| "num_tokens": 3292073.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 1.5532638430595398, | |
| "epoch": 1.2761768901569188, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 1.5979290194938938e-06, | |
| "loss": 0.4331, | |
| "mean_token_accuracy": 0.8702542334794998, | |
| "num_tokens": 3298200.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.5261798650026321, | |
| "epoch": 1.2784593437945793, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.5891526148322594e-06, | |
| "loss": 0.4389, | |
| "mean_token_accuracy": 0.862305723130703, | |
| "num_tokens": 3304356.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 1.536175400018692, | |
| "epoch": 1.2807417974322397, | |
| "grad_norm": 4.3125, | |
| "learning_rate": 1.5803891346407342e-06, | |
| "loss": 0.5677, | |
| "mean_token_accuracy": 0.8316505700349808, | |
| "num_tokens": 3309722.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 1.4453733563423157, | |
| "epoch": 1.2830242510699001, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 1.5716387032687314e-06, | |
| "loss": 0.3941, | |
| "mean_token_accuracy": 0.8798687309026718, | |
| "num_tokens": 3315076.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 1.5081749856472015, | |
| "epoch": 1.2853067047075606, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1.562901444880508e-06, | |
| "loss": 0.4143, | |
| "mean_token_accuracy": 0.8727659210562706, | |
| "num_tokens": 3320848.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 1.5081788897514343, | |
| "epoch": 1.287589158345221, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 1.5541774834534024e-06, | |
| "loss": 0.4623, | |
| "mean_token_accuracy": 0.8562600538134575, | |
| "num_tokens": 3327236.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 1.4714922159910202, | |
| "epoch": 1.2898716119828815, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.5454669427760774e-06, | |
| "loss": 0.4112, | |
| "mean_token_accuracy": 0.8714669123291969, | |
| "num_tokens": 3333039.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 1.496582642197609, | |
| "epoch": 1.2921540656205421, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 1.5367699464467596e-06, | |
| "loss": 0.4667, | |
| "mean_token_accuracy": 0.8694412559270859, | |
| "num_tokens": 3339578.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 1.453754335641861, | |
| "epoch": 1.2944365192582026, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 1.5280866178714898e-06, | |
| "loss": 0.4655, | |
| "mean_token_accuracy": 0.8703877553343773, | |
| "num_tokens": 3346073.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 1.496316447854042, | |
| "epoch": 1.296718972895863, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 1.5194170802623692e-06, | |
| "loss": 0.403, | |
| "mean_token_accuracy": 0.8825008124113083, | |
| "num_tokens": 3351735.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 1.5532702058553696, | |
| "epoch": 1.2990014265335235, | |
| "grad_norm": 3.375, | |
| "learning_rate": 1.5107614566358136e-06, | |
| "loss": 0.5159, | |
| "mean_token_accuracy": 0.872811533510685, | |
| "num_tokens": 3358008.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.3984228074550629, | |
| "epoch": 1.3012838801711841, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 1.5021198698108038e-06, | |
| "loss": 0.4531, | |
| "mean_token_accuracy": 0.8692669570446014, | |
| "num_tokens": 3364752.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 1.500732660293579, | |
| "epoch": 1.3035663338088446, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 1.4934924424071479e-06, | |
| "loss": 0.3973, | |
| "mean_token_accuracy": 0.8750224709510803, | |
| "num_tokens": 3369908.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 1.4046034514904022, | |
| "epoch": 1.305848787446505, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 1.4848792968437376e-06, | |
| "loss": 0.407, | |
| "mean_token_accuracy": 0.8775566592812538, | |
| "num_tokens": 3376101.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 1.4599164128303528, | |
| "epoch": 1.3081312410841655, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 1.4762805553368115e-06, | |
| "loss": 0.4068, | |
| "mean_token_accuracy": 0.8896359950304031, | |
| "num_tokens": 3381766.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 1.5650553405284882, | |
| "epoch": 1.310413694721826, | |
| "grad_norm": 3.90625, | |
| "learning_rate": 1.4676963398982248e-06, | |
| "loss": 0.526, | |
| "mean_token_accuracy": 0.8529334291815758, | |
| "num_tokens": 3387045.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 1.4292816668748856, | |
| "epoch": 1.3126961483594863, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 1.4591267723337122e-06, | |
| "loss": 0.4427, | |
| "mean_token_accuracy": 0.8748316466808319, | |
| "num_tokens": 3393002.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 1.5142599791288376, | |
| "epoch": 1.314978601997147, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 1.4505719742411644e-06, | |
| "loss": 0.3505, | |
| "mean_token_accuracy": 0.8907722160220146, | |
| "num_tokens": 3398389.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 1.3882330507040024, | |
| "epoch": 1.3172610556348074, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 1.4420320670088977e-06, | |
| "loss": 0.3516, | |
| "mean_token_accuracy": 0.891185887157917, | |
| "num_tokens": 3404815.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 1.5874699354171753, | |
| "epoch": 1.3195435092724679, | |
| "grad_norm": 3.6875, | |
| "learning_rate": 1.4335071718139379e-06, | |
| "loss": 0.5036, | |
| "mean_token_accuracy": 0.8607900366187096, | |
| "num_tokens": 3410299.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 1.5547137558460236, | |
| "epoch": 1.3218259629101283, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 1.424997409620295e-06, | |
| "loss": 0.4533, | |
| "mean_token_accuracy": 0.8668412491679192, | |
| "num_tokens": 3415403.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.3236225843429565, | |
| "epoch": 1.324108416547789, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 1.4165029011772513e-06, | |
| "loss": 0.4062, | |
| "mean_token_accuracy": 0.8871461227536201, | |
| "num_tokens": 3421683.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 1.469793826341629, | |
| "epoch": 1.3263908701854494, | |
| "grad_norm": 2.875, | |
| "learning_rate": 1.4080237670176456e-06, | |
| "loss": 0.4243, | |
| "mean_token_accuracy": 0.8801388815045357, | |
| "num_tokens": 3427994.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 1.4647793471813202, | |
| "epoch": 1.3286733238231099, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 1.3995601274561605e-06, | |
| "loss": 0.4262, | |
| "mean_token_accuracy": 0.8648821488022804, | |
| "num_tokens": 3434912.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 1.4584257155656815, | |
| "epoch": 1.3309557774607703, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 1.3911121025876212e-06, | |
| "loss": 0.4423, | |
| "mean_token_accuracy": 0.8798868283629417, | |
| "num_tokens": 3442058.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 1.5300581902265549, | |
| "epoch": 1.3332382310984308, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 1.382679812285287e-06, | |
| "loss": 0.4313, | |
| "mean_token_accuracy": 0.8496553376317024, | |
| "num_tokens": 3447771.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 1.5843760669231415, | |
| "epoch": 1.3355206847360912, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 1.3742633761991519e-06, | |
| "loss": 0.4945, | |
| "mean_token_accuracy": 0.8482984900474548, | |
| "num_tokens": 3452785.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 1.4104232043027878, | |
| "epoch": 1.3378031383737519, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 1.365862913754247e-06, | |
| "loss": 0.3925, | |
| "mean_token_accuracy": 0.8749718070030212, | |
| "num_tokens": 3458611.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 1.5725494027137756, | |
| "epoch": 1.3400855920114123, | |
| "grad_norm": 3.5, | |
| "learning_rate": 1.357478544148943e-06, | |
| "loss": 0.4045, | |
| "mean_token_accuracy": 0.8671303018927574, | |
| "num_tokens": 3465091.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 1.4776265919208527, | |
| "epoch": 1.3423680456490727, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 1.3491103863532626e-06, | |
| "loss": 0.3392, | |
| "mean_token_accuracy": 0.9015164896845818, | |
| "num_tokens": 3470488.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 1.6683387607336044, | |
| "epoch": 1.3446504992867332, | |
| "grad_norm": 4.8125, | |
| "learning_rate": 1.3407585591071944e-06, | |
| "loss": 0.5101, | |
| "mean_token_accuracy": 0.846831701695919, | |
| "num_tokens": 3475407.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.5126305967569351, | |
| "epoch": 1.3469329529243939, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 1.3324231809189985e-06, | |
| "loss": 0.4343, | |
| "mean_token_accuracy": 0.8680194914340973, | |
| "num_tokens": 3481469.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 1.5650955736637115, | |
| "epoch": 1.3492154065620543, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 1.3241043700635352e-06, | |
| "loss": 0.4892, | |
| "mean_token_accuracy": 0.86560869961977, | |
| "num_tokens": 3487280.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 1.5224829465150833, | |
| "epoch": 1.3514978601997147, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 1.3158022445805816e-06, | |
| "loss": 0.437, | |
| "mean_token_accuracy": 0.8517628982663155, | |
| "num_tokens": 3492699.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 1.5090250372886658, | |
| "epoch": 1.3537803138373752, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 1.3075169222731573e-06, | |
| "loss": 0.4919, | |
| "mean_token_accuracy": 0.8590176850557327, | |
| "num_tokens": 3498075.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 1.3978570252656937, | |
| "epoch": 1.3560627674750356, | |
| "grad_norm": 3.375, | |
| "learning_rate": 1.2992485207058548e-06, | |
| "loss": 0.4248, | |
| "mean_token_accuracy": 0.8699210062623024, | |
| "num_tokens": 3503380.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 1.4768076539039612, | |
| "epoch": 1.358345221112696, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 1.2909971572031663e-06, | |
| "loss": 0.4681, | |
| "mean_token_accuracy": 0.8609839826822281, | |
| "num_tokens": 3509109.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 1.4522972255945206, | |
| "epoch": 1.3606276747503565, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1.2827629488478254e-06, | |
| "loss": 0.5161, | |
| "mean_token_accuracy": 0.8707276359200478, | |
| "num_tokens": 3515057.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 1.57838936150074, | |
| "epoch": 1.3629101283880172, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.2745460124791425e-06, | |
| "loss": 0.4295, | |
| "mean_token_accuracy": 0.8608080074191093, | |
| "num_tokens": 3520795.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 1.4834775626659393, | |
| "epoch": 1.3651925820256776, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 1.266346464691346e-06, | |
| "loss": 0.4126, | |
| "mean_token_accuracy": 0.8710288777947426, | |
| "num_tokens": 3526380.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 1.5034915506839752, | |
| "epoch": 1.367475035663338, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.25816442183193e-06, | |
| "loss": 0.5211, | |
| "mean_token_accuracy": 0.837138943374157, | |
| "num_tokens": 3531865.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.367475035663338, | |
| "eval_entropy": 1.4793427891201443, | |
| "eval_loss": 0.473636656999588, | |
| "eval_mean_token_accuracy": 0.8656807369656033, | |
| "eval_num_tokens": 3531865.0, | |
| "eval_runtime": 4.3898, | |
| "eval_samples_per_second": 20.502, | |
| "eval_steps_per_second": 20.502, | |
| "step": 600 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 878, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.360105773011712e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |