{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.953191489361702, "eval_steps": 500, "global_step": 440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011347517730496455, "grad_norm": 42.11255056967552, "learning_rate": 4.999936276068748e-05, "loss": 2.7084, "num_input_tokens_seen": 262144, "step": 1 }, { "epoch": 0.02269503546099291, "grad_norm": 54.92874275677881, "learning_rate": 4.9997451075235834e-05, "loss": 3.9726, "num_input_tokens_seen": 524288, "step": 2 }, { "epoch": 0.03404255319148936, "grad_norm": 42.37953862248386, "learning_rate": 4.999426504110115e-05, "loss": 3.6425, "num_input_tokens_seen": 786432, "step": 3 }, { "epoch": 0.04539007092198582, "grad_norm": 91.1622917243015, "learning_rate": 4.9989804820704735e-05, "loss": 3.9759, "num_input_tokens_seen": 1048576, "step": 4 }, { "epoch": 0.05673758865248227, "grad_norm": 18.918639143868496, "learning_rate": 4.99840706414248e-05, "loss": 2.7491, "num_input_tokens_seen": 1310720, "step": 5 }, { "epoch": 0.06808510638297872, "grad_norm": 1015.4776658382041, "learning_rate": 4.9977062795584893e-05, "loss": 6.0462, "num_input_tokens_seen": 1572864, "step": 6 }, { "epoch": 0.07943262411347518, "grad_norm": 72.92199611683951, "learning_rate": 4.9968781640439026e-05, "loss": 4.8547, "num_input_tokens_seen": 1835008, "step": 7 }, { "epoch": 0.09078014184397164, "grad_norm": 2053.7092461560514, "learning_rate": 4.995922759815339e-05, "loss": 3.4759, "num_input_tokens_seen": 2097152, "step": 8 }, { "epoch": 0.10212765957446808, "grad_norm": 301.76868717065724, "learning_rate": 4.9948401155784904e-05, "loss": 5.716, "num_input_tokens_seen": 2359296, "step": 9 }, { "epoch": 0.11347517730496454, "grad_norm": 22.772625297554043, "learning_rate": 4.993630286525634e-05, "loss": 3.0399, "num_input_tokens_seen": 2621440, "step": 10 }, { "epoch": 0.12482269503546099, "grad_norm": 30.392870542185943, "learning_rate": 4.99229333433282e-05, "loss": 2.7072, "num_input_tokens_seen": 2883584, "step": 11 }, { "epoch": 0.13617021276595745, "grad_norm": 7.488897181026025, "learning_rate": 4.9908293271567286e-05, "loss": 2.3501, "num_input_tokens_seen": 3145728, "step": 12 }, { "epoch": 0.1475177304964539, "grad_norm": 7.085681299970245, "learning_rate": 4.9892383396311934e-05, "loss": 2.157, "num_input_tokens_seen": 3407872, "step": 13 }, { "epoch": 0.15886524822695036, "grad_norm": 8.190856615122739, "learning_rate": 4.987520452863399e-05, "loss": 2.0567, "num_input_tokens_seen": 3670016, "step": 14 }, { "epoch": 0.1702127659574468, "grad_norm": 10.596165600490249, "learning_rate": 4.985675754429744e-05, "loss": 2.0914, "num_input_tokens_seen": 3932160, "step": 15 }, { "epoch": 0.18156028368794327, "grad_norm": 4.228212579415264, "learning_rate": 4.9837043383713753e-05, "loss": 1.9227, "num_input_tokens_seen": 4194304, "step": 16 }, { "epoch": 0.19290780141843972, "grad_norm": 7.545954107986025, "learning_rate": 4.981606305189401e-05, "loss": 1.9327, "num_input_tokens_seen": 4456448, "step": 17 }, { "epoch": 0.20425531914893616, "grad_norm": 5.46892073376645, "learning_rate": 4.979381761839757e-05, "loss": 1.9021, "num_input_tokens_seen": 4718592, "step": 18 }, { "epoch": 0.21560283687943263, "grad_norm": 4.169344521056397, "learning_rate": 4.9770308217277614e-05, "loss": 1.8432, "num_input_tokens_seen": 4980736, "step": 19 }, { "epoch": 0.22695035460992907, "grad_norm": 4.236687724660673, "learning_rate": 4.9745536047023324e-05, "loss": 1.8072, "num_input_tokens_seen": 5242880, "step": 20 }, { "epoch": 0.23829787234042554, "grad_norm": 2.060568655336032, "learning_rate": 4.971950237049874e-05, "loss": 1.7635, "num_input_tokens_seen": 5505024, "step": 21 }, { "epoch": 0.24964539007092199, "grad_norm": 4.122485050689507, "learning_rate": 4.9692208514878444e-05, "loss": 1.7203, "num_input_tokens_seen": 5767168, "step": 22 }, { "epoch": 0.26099290780141843, "grad_norm": 2.219869083818665, "learning_rate": 4.966365587157986e-05, "loss": 1.697, "num_input_tokens_seen": 6029312, "step": 23 }, { "epoch": 0.2723404255319149, "grad_norm": 2.978772859366142, "learning_rate": 4.963384589619233e-05, "loss": 1.6882, "num_input_tokens_seen": 6291456, "step": 24 }, { "epoch": 0.28368794326241137, "grad_norm": 2.3166879594611562, "learning_rate": 4.96027801084029e-05, "loss": 1.6619, "num_input_tokens_seen": 6553600, "step": 25 }, { "epoch": 0.2950354609929078, "grad_norm": 2.660252399042677, "learning_rate": 4.957046009191889e-05, "loss": 1.6709, "num_input_tokens_seen": 6815744, "step": 26 }, { "epoch": 0.30638297872340425, "grad_norm": 1.9436043867392558, "learning_rate": 4.95368874943871e-05, "loss": 1.6295, "num_input_tokens_seen": 7077888, "step": 27 }, { "epoch": 0.3177304964539007, "grad_norm": 3.2367808954229056, "learning_rate": 4.9502064027309836e-05, "loss": 1.6475, "num_input_tokens_seen": 7340032, "step": 28 }, { "epoch": 0.32907801418439714, "grad_norm": 2.9573963451533163, "learning_rate": 4.946599146595769e-05, "loss": 1.6421, "num_input_tokens_seen": 7602176, "step": 29 }, { "epoch": 0.3404255319148936, "grad_norm": 2.0096985811179584, "learning_rate": 4.942867164927899e-05, "loss": 1.6335, "num_input_tokens_seen": 7864320, "step": 30 }, { "epoch": 0.3517730496453901, "grad_norm": 3.2248151762473904, "learning_rate": 4.9390106479806085e-05, "loss": 1.6085, "num_input_tokens_seen": 8126464, "step": 31 }, { "epoch": 0.36312056737588655, "grad_norm": 2.03961317414281, "learning_rate": 4.935029792355834e-05, "loss": 1.5966, "num_input_tokens_seen": 8388608, "step": 32 }, { "epoch": 0.37446808510638296, "grad_norm": 3.106580267812995, "learning_rate": 4.9309248009941914e-05, "loss": 1.5939, "num_input_tokens_seen": 8650752, "step": 33 }, { "epoch": 0.38581560283687943, "grad_norm": 2.356069611331115, "learning_rate": 4.9266958831646315e-05, "loss": 1.5713, "num_input_tokens_seen": 8912896, "step": 34 }, { "epoch": 0.3971631205673759, "grad_norm": 2.555281051112305, "learning_rate": 4.922343254453768e-05, "loss": 1.5476, "num_input_tokens_seen": 9175040, "step": 35 }, { "epoch": 0.4085106382978723, "grad_norm": 2.1382267614522377, "learning_rate": 4.917867136754893e-05, "loss": 1.5501, "num_input_tokens_seen": 9437184, "step": 36 }, { "epoch": 0.4198581560283688, "grad_norm": 2.182480841186189, "learning_rate": 4.913267758256658e-05, "loss": 1.5678, "num_input_tokens_seen": 9699328, "step": 37 }, { "epoch": 0.43120567375886526, "grad_norm": 1.9946631054615584, "learning_rate": 4.9085453534314476e-05, "loss": 1.5604, "num_input_tokens_seen": 9961472, "step": 38 }, { "epoch": 0.4425531914893617, "grad_norm": 2.1201916556543505, "learning_rate": 4.9037001630234215e-05, "loss": 1.5265, "num_input_tokens_seen": 10223616, "step": 39 }, { "epoch": 0.45390070921985815, "grad_norm": 1.9422378506921851, "learning_rate": 4.898732434036244e-05, "loss": 1.5269, "num_input_tokens_seen": 10485760, "step": 40 }, { "epoch": 0.4652482269503546, "grad_norm": 1.7767185160628844, "learning_rate": 4.893642419720491e-05, "loss": 1.4965, "num_input_tokens_seen": 10747904, "step": 41 }, { "epoch": 0.4765957446808511, "grad_norm": 1.6802879544286047, "learning_rate": 4.888430379560742e-05, "loss": 1.5254, "num_input_tokens_seen": 11010048, "step": 42 }, { "epoch": 0.4879432624113475, "grad_norm": 1.6860871708585086, "learning_rate": 4.883096579262346e-05, "loss": 1.4975, "num_input_tokens_seen": 11272192, "step": 43 }, { "epoch": 0.49929078014184397, "grad_norm": 1.7303274117706944, "learning_rate": 4.877641290737884e-05, "loss": 1.5209, "num_input_tokens_seen": 11534336, "step": 44 }, { "epoch": 0.5106382978723404, "grad_norm": 1.4626721944583252, "learning_rate": 4.872064792093299e-05, "loss": 1.51, "num_input_tokens_seen": 11796480, "step": 45 }, { "epoch": 0.5219858156028369, "grad_norm": 1.6681840945027697, "learning_rate": 4.866367367613725e-05, "loss": 1.5086, "num_input_tokens_seen": 12058624, "step": 46 }, { "epoch": 0.5333333333333333, "grad_norm": 1.5207849294270255, "learning_rate": 4.86054930774899e-05, "loss": 1.4588, "num_input_tokens_seen": 12320768, "step": 47 }, { "epoch": 0.5446808510638298, "grad_norm": 1.616710324729056, "learning_rate": 4.854610909098812e-05, "loss": 1.4805, "num_input_tokens_seen": 12582912, "step": 48 }, { "epoch": 0.5560283687943263, "grad_norm": 1.60340831300244, "learning_rate": 4.848552474397676e-05, "loss": 1.4696, "num_input_tokens_seen": 12845056, "step": 49 }, { "epoch": 0.5673758865248227, "grad_norm": 1.7880113956949981, "learning_rate": 4.842374312499405e-05, "loss": 1.4821, "num_input_tokens_seen": 13107200, "step": 50 }, { "epoch": 0.5787234042553191, "grad_norm": 1.3568656846238325, "learning_rate": 4.836076738361408e-05, "loss": 1.4884, "num_input_tokens_seen": 13369344, "step": 51 }, { "epoch": 0.5900709219858156, "grad_norm": 1.5552837054733266, "learning_rate": 4.829660073028631e-05, "loss": 1.4686, "num_input_tokens_seen": 13631488, "step": 52 }, { "epoch": 0.601418439716312, "grad_norm": 1.4041461982644978, "learning_rate": 4.823124643617187e-05, "loss": 1.4522, "num_input_tokens_seen": 13893632, "step": 53 }, { "epoch": 0.6127659574468085, "grad_norm": 1.475780206398884, "learning_rate": 4.8164707832976783e-05, "loss": 1.464, "num_input_tokens_seen": 14155776, "step": 54 }, { "epoch": 0.624113475177305, "grad_norm": 1.4112950034248575, "learning_rate": 4.8096988312782174e-05, "loss": 1.445, "num_input_tokens_seen": 14417920, "step": 55 }, { "epoch": 0.6354609929078014, "grad_norm": 1.6206014969548581, "learning_rate": 4.802809132787125e-05, "loss": 1.45, "num_input_tokens_seen": 14680064, "step": 56 }, { "epoch": 0.6468085106382979, "grad_norm": 1.2999206377427606, "learning_rate": 4.7958020390553426e-05, "loss": 1.4923, "num_input_tokens_seen": 14942208, "step": 57 }, { "epoch": 0.6581560283687943, "grad_norm": 1.2629239809770683, "learning_rate": 4.7886779072985156e-05, "loss": 1.4459, "num_input_tokens_seen": 15204352, "step": 58 }, { "epoch": 0.6695035460992907, "grad_norm": 1.6115720276042818, "learning_rate": 4.78143710069879e-05, "loss": 1.4211, "num_input_tokens_seen": 15466496, "step": 59 }, { "epoch": 0.6808510638297872, "grad_norm": 1.3564661495186316, "learning_rate": 4.774079988386296e-05, "loss": 1.4354, "num_input_tokens_seen": 15728640, "step": 60 }, { "epoch": 0.6921985815602837, "grad_norm": 1.4029089027053228, "learning_rate": 4.766606945420329e-05, "loss": 1.4552, "num_input_tokens_seen": 15990784, "step": 61 }, { "epoch": 0.7035460992907802, "grad_norm": 1.609508929492919, "learning_rate": 4.759018352770229e-05, "loss": 1.4401, "num_input_tokens_seen": 16252928, "step": 62 }, { "epoch": 0.7148936170212766, "grad_norm": 1.5314498833493546, "learning_rate": 4.751314597295963e-05, "loss": 1.4717, "num_input_tokens_seen": 16515072, "step": 63 }, { "epoch": 0.7262411347517731, "grad_norm": 1.4584944515244869, "learning_rate": 4.743496071728396e-05, "loss": 1.4372, "num_input_tokens_seen": 16777216, "step": 64 }, { "epoch": 0.7375886524822695, "grad_norm": 1.3848187202296613, "learning_rate": 4.735563174649278e-05, "loss": 1.4222, "num_input_tokens_seen": 17039360, "step": 65 }, { "epoch": 0.7489361702127659, "grad_norm": 1.322399845197604, "learning_rate": 4.72751631047092e-05, "loss": 1.4328, "num_input_tokens_seen": 17301504, "step": 66 }, { "epoch": 0.7602836879432624, "grad_norm": 1.2041955676624019, "learning_rate": 4.719355889415576e-05, "loss": 1.4133, "num_input_tokens_seen": 17563648, "step": 67 }, { "epoch": 0.7716312056737589, "grad_norm": 1.4025507710172729, "learning_rate": 4.711082327494536e-05, "loss": 1.4239, "num_input_tokens_seen": 17825792, "step": 68 }, { "epoch": 0.7829787234042553, "grad_norm": 1.5777455822322792, "learning_rate": 4.7026960464869116e-05, "loss": 1.437, "num_input_tokens_seen": 18087936, "step": 69 }, { "epoch": 0.7943262411347518, "grad_norm": 1.3436386092926063, "learning_rate": 4.6941974739181395e-05, "loss": 1.4243, "num_input_tokens_seen": 18350080, "step": 70 }, { "epoch": 0.8056737588652483, "grad_norm": 1.7031338750612581, "learning_rate": 4.6855870430381816e-05, "loss": 1.4272, "num_input_tokens_seen": 18612224, "step": 71 }, { "epoch": 0.8170212765957446, "grad_norm": 1.3269888785525612, "learning_rate": 4.6768651927994434e-05, "loss": 1.4054, "num_input_tokens_seen": 18874368, "step": 72 }, { "epoch": 0.8283687943262411, "grad_norm": 1.4476048219302555, "learning_rate": 4.668032367834392e-05, "loss": 1.413, "num_input_tokens_seen": 19136512, "step": 73 }, { "epoch": 0.8397163120567376, "grad_norm": 1.4535317388744813, "learning_rate": 4.6590890184328925e-05, "loss": 1.4054, "num_input_tokens_seen": 19398656, "step": 74 }, { "epoch": 0.851063829787234, "grad_norm": 1.2269074929733113, "learning_rate": 4.6500356005192514e-05, "loss": 1.3898, "num_input_tokens_seen": 19660800, "step": 75 }, { "epoch": 0.8624113475177305, "grad_norm": 1.3008326960072658, "learning_rate": 4.640872575628973e-05, "loss": 1.4042, "num_input_tokens_seen": 19922944, "step": 76 }, { "epoch": 0.873758865248227, "grad_norm": 1.2343345399913839, "learning_rate": 4.6316004108852305e-05, "loss": 1.4181, "num_input_tokens_seen": 20185088, "step": 77 }, { "epoch": 0.8851063829787233, "grad_norm": 1.40051458320969, "learning_rate": 4.622219578975057e-05, "loss": 1.3986, "num_input_tokens_seen": 20447232, "step": 78 }, { "epoch": 0.8964539007092198, "grad_norm": 1.2497430360048092, "learning_rate": 4.6127305581252414e-05, "loss": 1.3769, "num_input_tokens_seen": 20709376, "step": 79 }, { "epoch": 0.9078014184397163, "grad_norm": 1.2431538833258515, "learning_rate": 4.6031338320779534e-05, "loss": 1.4002, "num_input_tokens_seen": 20971520, "step": 80 }, { "epoch": 0.9191489361702128, "grad_norm": 1.0814373534222335, "learning_rate": 4.593429890066082e-05, "loss": 1.4156, "num_input_tokens_seen": 21233664, "step": 81 }, { "epoch": 0.9304964539007092, "grad_norm": 1.115346242762015, "learning_rate": 4.583619226788294e-05, "loss": 1.3867, "num_input_tokens_seen": 21495808, "step": 82 }, { "epoch": 0.9418439716312057, "grad_norm": 1.2127601722053167, "learning_rate": 4.573702342383816e-05, "loss": 1.3751, "num_input_tokens_seen": 21757952, "step": 83 }, { "epoch": 0.9531914893617022, "grad_norm": 1.1748214200248, "learning_rate": 4.563679742406935e-05, "loss": 1.3834, "num_input_tokens_seen": 22020096, "step": 84 }, { "epoch": 0.9645390070921985, "grad_norm": 1.1331339281580772, "learning_rate": 4.5535519378012295e-05, "loss": 1.3791, "num_input_tokens_seen": 22282240, "step": 85 }, { "epoch": 0.975886524822695, "grad_norm": 1.4004411805319554, "learning_rate": 4.543319444873517e-05, "loss": 1.3785, "num_input_tokens_seen": 22544384, "step": 86 }, { "epoch": 0.9872340425531915, "grad_norm": 1.2118340365527955, "learning_rate": 4.532982785267541e-05, "loss": 1.3635, "num_input_tokens_seen": 22806528, "step": 87 }, { "epoch": 0.9985815602836879, "grad_norm": 1.5899548683498645, "learning_rate": 4.522542485937369e-05, "loss": 1.3879, "num_input_tokens_seen": 23068672, "step": 88 }, { "epoch": 1.0, "grad_norm": 1.5899548683498645, "learning_rate": 4.511999079120534e-05, "loss": 1.2364, "num_input_tokens_seen": 23101440, "step": 89 }, { "epoch": 1.0113475177304965, "grad_norm": 3.1471576605311427, "learning_rate": 4.5013531023109014e-05, "loss": 1.0204, "num_input_tokens_seen": 23363584, "step": 90 }, { "epoch": 1.022695035460993, "grad_norm": 1.9813456886904501, "learning_rate": 4.4906050982312664e-05, "loss": 1.0799, "num_input_tokens_seen": 23625728, "step": 91 }, { "epoch": 1.0340425531914894, "grad_norm": 1.7175351501466867, "learning_rate": 4.479755614805688e-05, "loss": 1.0052, "num_input_tokens_seen": 23887872, "step": 92 }, { "epoch": 1.0453900709219859, "grad_norm": 1.970342186079611, "learning_rate": 4.4688052051315545e-05, "loss": 1.0531, "num_input_tokens_seen": 24150016, "step": 93 }, { "epoch": 1.0567375886524824, "grad_norm": 1.7872662354821134, "learning_rate": 4.457754427451389e-05, "loss": 1.0159, "num_input_tokens_seen": 24412160, "step": 94 }, { "epoch": 1.0680851063829788, "grad_norm": 1.8666514589785563, "learning_rate": 4.446603845124388e-05, "loss": 0.9947, "num_input_tokens_seen": 24674304, "step": 95 }, { "epoch": 1.0794326241134753, "grad_norm": 1.5777077629921268, "learning_rate": 4.4353540265977064e-05, "loss": 0.9969, "num_input_tokens_seen": 24936448, "step": 96 }, { "epoch": 1.0907801418439715, "grad_norm": 1.4442562002627628, "learning_rate": 4.4240055453774734e-05, "loss": 1.02, "num_input_tokens_seen": 25198592, "step": 97 }, { "epoch": 1.102127659574468, "grad_norm": 1.6801959052168647, "learning_rate": 4.412558979999558e-05, "loss": 0.9789, "num_input_tokens_seen": 25460736, "step": 98 }, { "epoch": 1.1134751773049645, "grad_norm": 1.3983076558719154, "learning_rate": 4.401014914000078e-05, "loss": 1.0035, "num_input_tokens_seen": 25722880, "step": 99 }, { "epoch": 1.124822695035461, "grad_norm": 1.4068549353820048, "learning_rate": 4.389373935885646e-05, "loss": 0.9751, "num_input_tokens_seen": 25985024, "step": 100 }, { "epoch": 1.1361702127659574, "grad_norm": 1.4162325676250134, "learning_rate": 4.3776366391033746e-05, "loss": 0.9464, "num_input_tokens_seen": 26247168, "step": 101 }, { "epoch": 1.147517730496454, "grad_norm": 1.3675197337363096, "learning_rate": 4.365803622010618e-05, "loss": 0.9592, "num_input_tokens_seen": 26509312, "step": 102 }, { "epoch": 1.1588652482269504, "grad_norm": 1.5007646821184917, "learning_rate": 4.35387548784447e-05, "loss": 0.9709, "num_input_tokens_seen": 26771456, "step": 103 }, { "epoch": 1.1702127659574468, "grad_norm": 1.4773281381482295, "learning_rate": 4.341852844691012e-05, "loss": 0.9834, "num_input_tokens_seen": 27033600, "step": 104 }, { "epoch": 1.1815602836879433, "grad_norm": 1.933895535557616, "learning_rate": 4.329736305454314e-05, "loss": 0.9999, "num_input_tokens_seen": 27295744, "step": 105 }, { "epoch": 1.1929078014184398, "grad_norm": 1.6902638904380827, "learning_rate": 4.3175264878251845e-05, "loss": 0.9294, "num_input_tokens_seen": 27557888, "step": 106 }, { "epoch": 1.2042553191489362, "grad_norm": 1.696391072445269, "learning_rate": 4.305224014249688e-05, "loss": 0.9906, "num_input_tokens_seen": 27820032, "step": 107 }, { "epoch": 1.2156028368794327, "grad_norm": 1.7162350808896498, "learning_rate": 4.292829511897409e-05, "loss": 0.9731, "num_input_tokens_seen": 28082176, "step": 108 }, { "epoch": 1.226950354609929, "grad_norm": 1.736567992601448, "learning_rate": 4.280343612629479e-05, "loss": 0.978, "num_input_tokens_seen": 28344320, "step": 109 }, { "epoch": 1.2382978723404254, "grad_norm": 1.3602272146940024, "learning_rate": 4.267766952966369e-05, "loss": 0.9325, "num_input_tokens_seen": 28606464, "step": 110 }, { "epoch": 1.249645390070922, "grad_norm": 1.5745245166339177, "learning_rate": 4.255100174055434e-05, "loss": 0.9657, "num_input_tokens_seen": 28868608, "step": 111 }, { "epoch": 1.2609929078014184, "grad_norm": 1.3239840597482868, "learning_rate": 4.242343921638234e-05, "loss": 0.9451, "num_input_tokens_seen": 29130752, "step": 112 }, { "epoch": 1.2723404255319148, "grad_norm": 1.392428377159933, "learning_rate": 4.22949884601761e-05, "loss": 0.9632, "num_input_tokens_seen": 29392896, "step": 113 }, { "epoch": 1.2836879432624113, "grad_norm": 1.382581854515865, "learning_rate": 4.2165656020245336e-05, "loss": 0.9483, "num_input_tokens_seen": 29655040, "step": 114 }, { "epoch": 1.2950354609929078, "grad_norm": 1.5462756056089444, "learning_rate": 4.2035448489847284e-05, "loss": 0.9899, "num_input_tokens_seen": 29917184, "step": 115 }, { "epoch": 1.3063829787234043, "grad_norm": 1.4147640794435994, "learning_rate": 4.1904372506850484e-05, "loss": 0.9092, "num_input_tokens_seen": 30179328, "step": 116 }, { "epoch": 1.3177304964539007, "grad_norm": 1.2432801814071204, "learning_rate": 4.1772434753396504e-05, "loss": 0.9501, "num_input_tokens_seen": 30441472, "step": 117 }, { "epoch": 1.3290780141843972, "grad_norm": 1.4932728466362941, "learning_rate": 4.1639641955559205e-05, "loss": 1.0093, "num_input_tokens_seen": 30703616, "step": 118 }, { "epoch": 1.3404255319148937, "grad_norm": 1.146394031379382, "learning_rate": 4.1506000883001875e-05, "loss": 0.95, "num_input_tokens_seen": 30965760, "step": 119 }, { "epoch": 1.3517730496453901, "grad_norm": 1.314570904504416, "learning_rate": 4.137151834863213e-05, "loss": 0.9733, "num_input_tokens_seen": 31227904, "step": 120 }, { "epoch": 1.3631205673758866, "grad_norm": 1.144532383386255, "learning_rate": 4.123620120825459e-05, "loss": 0.9503, "num_input_tokens_seen": 31490048, "step": 121 }, { "epoch": 1.374468085106383, "grad_norm": 1.407081639684963, "learning_rate": 4.1100056360221384e-05, "loss": 0.9787, "num_input_tokens_seen": 31752192, "step": 122 }, { "epoch": 1.3858156028368795, "grad_norm": 1.1478616889934938, "learning_rate": 4.096309074508046e-05, "loss": 0.9697, "num_input_tokens_seen": 32014336, "step": 123 }, { "epoch": 1.397163120567376, "grad_norm": 1.1798657684361409, "learning_rate": 4.082531134522176e-05, "loss": 0.9397, "num_input_tokens_seen": 32276480, "step": 124 }, { "epoch": 1.4085106382978723, "grad_norm": 1.1170685676293548, "learning_rate": 4.06867251845213e-05, "loss": 0.9466, "num_input_tokens_seen": 32538624, "step": 125 }, { "epoch": 1.4198581560283687, "grad_norm": 1.158583040981931, "learning_rate": 4.054733932798306e-05, "loss": 0.9517, "num_input_tokens_seen": 32800768, "step": 126 }, { "epoch": 1.4312056737588652, "grad_norm": 1.1206965982137238, "learning_rate": 4.0407160881378824e-05, "loss": 0.9299, "num_input_tokens_seen": 33062912, "step": 127 }, { "epoch": 1.4425531914893617, "grad_norm": 1.2166341082803098, "learning_rate": 4.0266196990885955e-05, "loss": 0.9674, "num_input_tokens_seen": 33325056, "step": 128 }, { "epoch": 1.4539007092198581, "grad_norm": 1.253313105155196, "learning_rate": 4.012445484272307e-05, "loss": 0.935, "num_input_tokens_seen": 33587200, "step": 129 }, { "epoch": 1.4652482269503546, "grad_norm": 1.1241425426902512, "learning_rate": 3.9981941662783674e-05, "loss": 0.9856, "num_input_tokens_seen": 33849344, "step": 130 }, { "epoch": 1.476595744680851, "grad_norm": 1.2496739496083311, "learning_rate": 3.9838664716267855e-05, "loss": 0.95, "num_input_tokens_seen": 34111488, "step": 131 }, { "epoch": 1.4879432624113476, "grad_norm": 1.2413295026999056, "learning_rate": 3.969463130731183e-05, "loss": 1.0095, "num_input_tokens_seen": 34373632, "step": 132 }, { "epoch": 1.499290780141844, "grad_norm": 1.0994149236511626, "learning_rate": 3.954984877861565e-05, "loss": 0.9711, "num_input_tokens_seen": 34635776, "step": 133 }, { "epoch": 1.5106382978723403, "grad_norm": 1.2204241140248873, "learning_rate": 3.9404324511068825e-05, "loss": 1.0004, "num_input_tokens_seen": 34897920, "step": 134 }, { "epoch": 1.5219858156028367, "grad_norm": 1.0639734593664278, "learning_rate": 3.92580659233741e-05, "loss": 0.9683, "num_input_tokens_seen": 35160064, "step": 135 }, { "epoch": 1.5333333333333332, "grad_norm": 1.286928460200864, "learning_rate": 3.911108047166924e-05, "loss": 0.9735, "num_input_tokens_seen": 35422208, "step": 136 }, { "epoch": 1.5446808510638297, "grad_norm": 1.1776256536499852, "learning_rate": 3.8963375649146866e-05, "loss": 0.9917, "num_input_tokens_seen": 35684352, "step": 137 }, { "epoch": 1.5560283687943262, "grad_norm": 1.1927233327599214, "learning_rate": 3.881495898567257e-05, "loss": 0.9585, "num_input_tokens_seen": 35946496, "step": 138 }, { "epoch": 1.5673758865248226, "grad_norm": 1.1616205054974948, "learning_rate": 3.866583804740095e-05, "loss": 0.9305, "num_input_tokens_seen": 36208640, "step": 139 }, { "epoch": 1.578723404255319, "grad_norm": 1.1608542297205653, "learning_rate": 3.851602043638994e-05, "loss": 0.9738, "num_input_tokens_seen": 36470784, "step": 140 }, { "epoch": 1.5900709219858156, "grad_norm": 1.1754828045051902, "learning_rate": 3.8365513790213265e-05, "loss": 0.9627, "num_input_tokens_seen": 36732928, "step": 141 }, { "epoch": 1.601418439716312, "grad_norm": 1.2412851531549465, "learning_rate": 3.821432578157105e-05, "loss": 0.9673, "num_input_tokens_seen": 36995072, "step": 142 }, { "epoch": 1.6127659574468085, "grad_norm": 1.1681668630687394, "learning_rate": 3.8062464117898724e-05, "loss": 0.9908, "num_input_tokens_seen": 37257216, "step": 143 }, { "epoch": 1.624113475177305, "grad_norm": 1.1771959095169333, "learning_rate": 3.790993654097405e-05, "loss": 0.9913, "num_input_tokens_seen": 37519360, "step": 144 }, { "epoch": 1.6354609929078014, "grad_norm": 1.2334490161088576, "learning_rate": 3.77567508265225e-05, "loss": 0.9911, "num_input_tokens_seen": 37781504, "step": 145 }, { "epoch": 1.646808510638298, "grad_norm": 1.132776925570845, "learning_rate": 3.76029147838208e-05, "loss": 0.9801, "num_input_tokens_seen": 38043648, "step": 146 }, { "epoch": 1.6581560283687944, "grad_norm": 1.2852017625106646, "learning_rate": 3.74484362552989e-05, "loss": 0.9725, "num_input_tokens_seen": 38305792, "step": 147 }, { "epoch": 1.6695035460992909, "grad_norm": 1.1423632361464884, "learning_rate": 3.72933231161401e-05, "loss": 0.961, "num_input_tokens_seen": 38567936, "step": 148 }, { "epoch": 1.6808510638297873, "grad_norm": 1.1887510715530398, "learning_rate": 3.713758327387961e-05, "loss": 0.944, "num_input_tokens_seen": 38830080, "step": 149 }, { "epoch": 1.6921985815602838, "grad_norm": 1.2005973709528561, "learning_rate": 3.6981224668001424e-05, "loss": 0.9855, "num_input_tokens_seen": 39092224, "step": 150 }, { "epoch": 1.7035460992907803, "grad_norm": 1.1796966769071517, "learning_rate": 3.682425526953359e-05, "loss": 0.9785, "num_input_tokens_seen": 39354368, "step": 151 }, { "epoch": 1.7148936170212767, "grad_norm": 1.1401769784499336, "learning_rate": 3.6666683080641846e-05, "loss": 0.9509, "num_input_tokens_seen": 39616512, "step": 152 }, { "epoch": 1.7262411347517732, "grad_norm": 1.2720626628857692, "learning_rate": 3.6508516134221635e-05, "loss": 0.965, "num_input_tokens_seen": 39878656, "step": 153 }, { "epoch": 1.7375886524822695, "grad_norm": 1.1887836841186321, "learning_rate": 3.634976249348867e-05, "loss": 0.9737, "num_input_tokens_seen": 40140800, "step": 154 }, { "epoch": 1.748936170212766, "grad_norm": 1.2318081552834408, "learning_rate": 3.619043025156782e-05, "loss": 0.947, "num_input_tokens_seen": 40402944, "step": 155 }, { "epoch": 1.7602836879432624, "grad_norm": 1.0371027068087095, "learning_rate": 3.603052753108053e-05, "loss": 0.9782, "num_input_tokens_seen": 40665088, "step": 156 }, { "epoch": 1.7716312056737589, "grad_norm": 1.2582450073527114, "learning_rate": 3.58700624837308e-05, "loss": 0.9852, "num_input_tokens_seen": 40927232, "step": 157 }, { "epoch": 1.7829787234042553, "grad_norm": 1.159400337663857, "learning_rate": 3.5709043289889536e-05, "loss": 0.9779, "num_input_tokens_seen": 41189376, "step": 158 }, { "epoch": 1.7943262411347518, "grad_norm": 1.1094376112354303, "learning_rate": 3.554747815817756e-05, "loss": 0.9958, "num_input_tokens_seen": 41451520, "step": 159 }, { "epoch": 1.8056737588652483, "grad_norm": 1.2198757703909164, "learning_rate": 3.5385375325047166e-05, "loss": 0.9784, "num_input_tokens_seen": 41713664, "step": 160 }, { "epoch": 1.8170212765957445, "grad_norm": 1.0480308860957073, "learning_rate": 3.522274305436217e-05, "loss": 0.9644, "num_input_tokens_seen": 41975808, "step": 161 }, { "epoch": 1.828368794326241, "grad_norm": 1.1883804521566375, "learning_rate": 3.50595896369767e-05, "loss": 0.9513, "num_input_tokens_seen": 42237952, "step": 162 }, { "epoch": 1.8397163120567375, "grad_norm": 1.1041573287946038, "learning_rate": 3.4895923390312466e-05, "loss": 0.9935, "num_input_tokens_seen": 42500096, "step": 163 }, { "epoch": 1.851063829787234, "grad_norm": 1.0322154463171043, "learning_rate": 3.4731752657934794e-05, "loss": 1.0127, "num_input_tokens_seen": 42762240, "step": 164 }, { "epoch": 1.8624113475177304, "grad_norm": 1.210272229526448, "learning_rate": 3.456708580912725e-05, "loss": 0.94, "num_input_tokens_seen": 43024384, "step": 165 }, { "epoch": 1.8737588652482269, "grad_norm": 1.1397057549276046, "learning_rate": 3.4401931238464994e-05, "loss": 1.0017, "num_input_tokens_seen": 43286528, "step": 166 }, { "epoch": 1.8851063829787233, "grad_norm": 1.052990158347976, "learning_rate": 3.423629736538685e-05, "loss": 0.9705, "num_input_tokens_seen": 43548672, "step": 167 }, { "epoch": 1.8964539007092198, "grad_norm": 1.247482706002507, "learning_rate": 3.4070192633766025e-05, "loss": 0.9841, "num_input_tokens_seen": 43810816, "step": 168 }, { "epoch": 1.9078014184397163, "grad_norm": 1.1048483913646374, "learning_rate": 3.390362551147974e-05, "loss": 0.9737, "num_input_tokens_seen": 44072960, "step": 169 }, { "epoch": 1.9191489361702128, "grad_norm": 1.0770863251176332, "learning_rate": 3.3736604489977466e-05, "loss": 1.0008, "num_input_tokens_seen": 44335104, "step": 170 }, { "epoch": 1.9304964539007092, "grad_norm": 1.130827655449932, "learning_rate": 3.356913808384807e-05, "loss": 0.9726, "num_input_tokens_seen": 44597248, "step": 171 }, { "epoch": 1.9418439716312057, "grad_norm": 1.0722010665917157, "learning_rate": 3.3401234830385756e-05, "loss": 0.9711, "num_input_tokens_seen": 44859392, "step": 172 }, { "epoch": 1.9531914893617022, "grad_norm": 1.1584145820517182, "learning_rate": 3.323290328915483e-05, "loss": 0.9989, "num_input_tokens_seen": 45121536, "step": 173 }, { "epoch": 1.9645390070921986, "grad_norm": 1.0846738851387514, "learning_rate": 3.306415204155335e-05, "loss": 0.9588, "num_input_tokens_seen": 45383680, "step": 174 }, { "epoch": 1.9758865248226951, "grad_norm": 1.102142271317201, "learning_rate": 3.2894989690375626e-05, "loss": 0.9476, "num_input_tokens_seen": 45645824, "step": 175 }, { "epoch": 1.9872340425531916, "grad_norm": 1.2026725696126155, "learning_rate": 3.272542485937369e-05, "loss": 0.9606, "num_input_tokens_seen": 45907968, "step": 176 }, { "epoch": 1.998581560283688, "grad_norm": 1.2427515943863843, "learning_rate": 3.255546619281765e-05, "loss": 1.0062, "num_input_tokens_seen": 46170112, "step": 177 }, { "epoch": 2.0, "grad_norm": 1.2427515943863843, "learning_rate": 3.2385122355055005e-05, "loss": 0.7789, "num_input_tokens_seen": 46202880, "step": 178 }, { "epoch": 2.0113475177304965, "grad_norm": 3.9851861322995004, "learning_rate": 3.221440203006897e-05, "loss": 0.5235, "num_input_tokens_seen": 46465024, "step": 179 }, { "epoch": 2.022695035460993, "grad_norm": 2.782182214592954, "learning_rate": 3.2043313921035743e-05, "loss": 0.5097, "num_input_tokens_seen": 46727168, "step": 180 }, { "epoch": 2.0340425531914894, "grad_norm": 1.8270642765269718, "learning_rate": 3.1871866749880846e-05, "loss": 0.4839, "num_input_tokens_seen": 46989312, "step": 181 }, { "epoch": 2.045390070921986, "grad_norm": 3.7339811640972473, "learning_rate": 3.170006925683448e-05, "loss": 0.4825, "num_input_tokens_seen": 47251456, "step": 182 }, { "epoch": 2.0567375886524824, "grad_norm": 2.9941865165214754, "learning_rate": 3.152793019998594e-05, "loss": 0.4497, "num_input_tokens_seen": 47513600, "step": 183 }, { "epoch": 2.068085106382979, "grad_norm": 1.8964492803313993, "learning_rate": 3.135545835483718e-05, "loss": 0.4443, "num_input_tokens_seen": 47775744, "step": 184 }, { "epoch": 2.0794326241134753, "grad_norm": 1.6098003889750838, "learning_rate": 3.118266251385539e-05, "loss": 0.4355, "num_input_tokens_seen": 48037888, "step": 185 }, { "epoch": 2.0907801418439718, "grad_norm": 1.523263157923423, "learning_rate": 3.100955148602481e-05, "loss": 0.4265, "num_input_tokens_seen": 48300032, "step": 186 }, { "epoch": 2.1021276595744682, "grad_norm": 1.3639153937211606, "learning_rate": 3.083613409639764e-05, "loss": 0.4058, "num_input_tokens_seen": 48562176, "step": 187 }, { "epoch": 2.1134751773049647, "grad_norm": 1.370922535424333, "learning_rate": 3.0662419185644115e-05, "loss": 0.4004, "num_input_tokens_seen": 48824320, "step": 188 }, { "epoch": 2.124822695035461, "grad_norm": 1.440676408033895, "learning_rate": 3.0488415609601862e-05, "loss": 0.389, "num_input_tokens_seen": 49086464, "step": 189 }, { "epoch": 2.1361702127659576, "grad_norm": 1.310285355956054, "learning_rate": 3.0314132238824415e-05, "loss": 0.404, "num_input_tokens_seen": 49348608, "step": 190 }, { "epoch": 2.147517730496454, "grad_norm": 1.4941784665076816, "learning_rate": 3.013957795812902e-05, "loss": 0.382, "num_input_tokens_seen": 49610752, "step": 191 }, { "epoch": 2.1588652482269506, "grad_norm": 1.4030391388098873, "learning_rate": 2.996476166614364e-05, "loss": 0.3788, "num_input_tokens_seen": 49872896, "step": 192 }, { "epoch": 2.1702127659574466, "grad_norm": 1.256129131429154, "learning_rate": 2.9789692274853388e-05, "loss": 0.3925, "num_input_tokens_seen": 50135040, "step": 193 }, { "epoch": 2.181560283687943, "grad_norm": 1.4558080899884718, "learning_rate": 2.9614378709146133e-05, "loss": 0.3868, "num_input_tokens_seen": 50397184, "step": 194 }, { "epoch": 2.1929078014184396, "grad_norm": 1.250507734486578, "learning_rate": 2.943882990635759e-05, "loss": 0.3775, "num_input_tokens_seen": 50659328, "step": 195 }, { "epoch": 2.204255319148936, "grad_norm": 1.2741202231518605, "learning_rate": 2.92630548158156e-05, "loss": 0.3897, "num_input_tokens_seen": 50921472, "step": 196 }, { "epoch": 2.2156028368794325, "grad_norm": 1.2133688441052128, "learning_rate": 2.9087062398384e-05, "loss": 0.3644, "num_input_tokens_seen": 51183616, "step": 197 }, { "epoch": 2.226950354609929, "grad_norm": 1.1909730406616328, "learning_rate": 2.8910861626005776e-05, "loss": 0.3933, "num_input_tokens_seen": 51445760, "step": 198 }, { "epoch": 2.2382978723404254, "grad_norm": 1.1826769961907178, "learning_rate": 2.873446148124563e-05, "loss": 0.4031, "num_input_tokens_seen": 51707904, "step": 199 }, { "epoch": 2.249645390070922, "grad_norm": 1.2503503019173514, "learning_rate": 2.8557870956832132e-05, "loss": 0.3774, "num_input_tokens_seen": 51970048, "step": 200 }, { "epoch": 2.2609929078014184, "grad_norm": 1.1228352195116507, "learning_rate": 2.8381099055199222e-05, "loss": 0.396, "num_input_tokens_seen": 52232192, "step": 201 }, { "epoch": 2.272340425531915, "grad_norm": 1.2833412130464927, "learning_rate": 2.8204154788027325e-05, "loss": 0.3589, "num_input_tokens_seen": 52494336, "step": 202 }, { "epoch": 2.2836879432624113, "grad_norm": 1.1493157288933067, "learning_rate": 2.8027047175783873e-05, "loss": 0.36, "num_input_tokens_seen": 52756480, "step": 203 }, { "epoch": 2.295035460992908, "grad_norm": 1.2057679652659834, "learning_rate": 2.7849785247263515e-05, "loss": 0.3938, "num_input_tokens_seen": 53018624, "step": 204 }, { "epoch": 2.3063829787234043, "grad_norm": 1.2859581898852654, "learning_rate": 2.767237803912783e-05, "loss": 0.4006, "num_input_tokens_seen": 53280768, "step": 205 }, { "epoch": 2.3177304964539007, "grad_norm": 1.2255059859165218, "learning_rate": 2.7494834595444568e-05, "loss": 0.3798, "num_input_tokens_seen": 53542912, "step": 206 }, { "epoch": 2.329078014184397, "grad_norm": 1.1213649523858145, "learning_rate": 2.731716396722672e-05, "loss": 0.3806, "num_input_tokens_seen": 53805056, "step": 207 }, { "epoch": 2.3404255319148937, "grad_norm": 1.215408045504973, "learning_rate": 2.7139375211970996e-05, "loss": 0.3794, "num_input_tokens_seen": 54067200, "step": 208 }, { "epoch": 2.35177304964539, "grad_norm": 1.1823599565086842, "learning_rate": 2.6961477393196126e-05, "loss": 0.3836, "num_input_tokens_seen": 54329344, "step": 209 }, { "epoch": 2.3631205673758866, "grad_norm": 1.1340883173812746, "learning_rate": 2.6783479579980807e-05, "loss": 0.3769, "num_input_tokens_seen": 54591488, "step": 210 }, { "epoch": 2.374468085106383, "grad_norm": 1.2066922550248658, "learning_rate": 2.6605390846501377e-05, "loss": 0.3854, "num_input_tokens_seen": 54853632, "step": 211 }, { "epoch": 2.3858156028368795, "grad_norm": 1.2171750570169015, "learning_rate": 2.6427220271569203e-05, "loss": 0.3872, "num_input_tokens_seen": 55115776, "step": 212 }, { "epoch": 2.397163120567376, "grad_norm": 1.154268662146649, "learning_rate": 2.624897693816785e-05, "loss": 0.3877, "num_input_tokens_seen": 55377920, "step": 213 }, { "epoch": 2.4085106382978725, "grad_norm": 1.1755024157603309, "learning_rate": 2.6070669932990067e-05, "loss": 0.3839, "num_input_tokens_seen": 55640064, "step": 214 }, { "epoch": 2.419858156028369, "grad_norm": 1.2230171275187467, "learning_rate": 2.5892308345974515e-05, "loss": 0.3765, "num_input_tokens_seen": 55902208, "step": 215 }, { "epoch": 2.4312056737588654, "grad_norm": 1.1601360552600042, "learning_rate": 2.5713901269842404e-05, "loss": 0.3788, "num_input_tokens_seen": 56164352, "step": 216 }, { "epoch": 2.4425531914893615, "grad_norm": 1.2731492044445878, "learning_rate": 2.5535457799633955e-05, "loss": 0.3773, "num_input_tokens_seen": 56426496, "step": 217 }, { "epoch": 2.453900709219858, "grad_norm": 1.1913197047744846, "learning_rate": 2.5356987032244683e-05, "loss": 0.3741, "num_input_tokens_seen": 56688640, "step": 218 }, { "epoch": 2.4652482269503544, "grad_norm": 1.3734178315666306, "learning_rate": 2.5178498065961736e-05, "loss": 0.3959, "num_input_tokens_seen": 56950784, "step": 219 }, { "epoch": 2.476595744680851, "grad_norm": 1.2130341884336506, "learning_rate": 2.5e-05, "loss": 0.3795, "num_input_tokens_seen": 57212928, "step": 220 }, { "epoch": 2.4879432624113473, "grad_norm": 1.2201255253281913, "learning_rate": 2.4821501934038266e-05, "loss": 0.3643, "num_input_tokens_seen": 57475072, "step": 221 }, { "epoch": 2.499290780141844, "grad_norm": 1.215550219664517, "learning_rate": 2.4643012967755326e-05, "loss": 0.3926, "num_input_tokens_seen": 57737216, "step": 222 }, { "epoch": 2.5106382978723403, "grad_norm": 1.390476571364045, "learning_rate": 2.446454220036605e-05, "loss": 0.3904, "num_input_tokens_seen": 57999360, "step": 223 }, { "epoch": 2.5219858156028367, "grad_norm": 1.117871728218981, "learning_rate": 2.42860987301576e-05, "loss": 0.373, "num_input_tokens_seen": 58261504, "step": 224 }, { "epoch": 2.533333333333333, "grad_norm": 1.3698199224662784, "learning_rate": 2.410769165402549e-05, "loss": 0.4068, "num_input_tokens_seen": 58523648, "step": 225 }, { "epoch": 2.5446808510638297, "grad_norm": 1.3551110495536183, "learning_rate": 2.3929330067009942e-05, "loss": 0.3859, "num_input_tokens_seen": 58785792, "step": 226 }, { "epoch": 2.556028368794326, "grad_norm": 1.294001666579498, "learning_rate": 2.3751023061832158e-05, "loss": 0.3738, "num_input_tokens_seen": 59047936, "step": 227 }, { "epoch": 2.5673758865248226, "grad_norm": 1.2033518812521018, "learning_rate": 2.35727797284308e-05, "loss": 0.4054, "num_input_tokens_seen": 59310080, "step": 228 }, { "epoch": 2.578723404255319, "grad_norm": 1.3809415688054603, "learning_rate": 2.339460915349862e-05, "loss": 0.3948, "num_input_tokens_seen": 59572224, "step": 229 }, { "epoch": 2.5900709219858156, "grad_norm": 1.144779349719294, "learning_rate": 2.3216520420019195e-05, "loss": 0.3976, "num_input_tokens_seen": 59834368, "step": 230 }, { "epoch": 2.601418439716312, "grad_norm": 1.2132146253753062, "learning_rate": 2.303852260680388e-05, "loss": 0.3899, "num_input_tokens_seen": 60096512, "step": 231 }, { "epoch": 2.6127659574468085, "grad_norm": 1.1499252028219595, "learning_rate": 2.2860624788029013e-05, "loss": 0.4025, "num_input_tokens_seen": 60358656, "step": 232 }, { "epoch": 2.624113475177305, "grad_norm": 1.1403014147417394, "learning_rate": 2.268283603277328e-05, "loss": 0.3858, "num_input_tokens_seen": 60620800, "step": 233 }, { "epoch": 2.6354609929078014, "grad_norm": 1.13239820788593, "learning_rate": 2.250516540455543e-05, "loss": 0.3676, "num_input_tokens_seen": 60882944, "step": 234 }, { "epoch": 2.646808510638298, "grad_norm": 1.1982201887250508, "learning_rate": 2.2327621960872187e-05, "loss": 0.3744, "num_input_tokens_seen": 61145088, "step": 235 }, { "epoch": 2.6581560283687944, "grad_norm": 1.091653134615552, "learning_rate": 2.2150214752736488e-05, "loss": 0.3804, "num_input_tokens_seen": 61407232, "step": 236 }, { "epoch": 2.669503546099291, "grad_norm": 1.0757322285847275, "learning_rate": 2.197295282421613e-05, "loss": 0.3636, "num_input_tokens_seen": 61669376, "step": 237 }, { "epoch": 2.6808510638297873, "grad_norm": 1.1907618432720997, "learning_rate": 2.179584521197268e-05, "loss": 0.3836, "num_input_tokens_seen": 61931520, "step": 238 }, { "epoch": 2.692198581560284, "grad_norm": 1.154673483311636, "learning_rate": 2.1618900944800777e-05, "loss": 0.3797, "num_input_tokens_seen": 62193664, "step": 239 }, { "epoch": 2.7035460992907803, "grad_norm": 1.155175360207987, "learning_rate": 2.1442129043167874e-05, "loss": 0.3783, "num_input_tokens_seen": 62455808, "step": 240 }, { "epoch": 2.7148936170212767, "grad_norm": 1.2332827172317569, "learning_rate": 2.1265538518754374e-05, "loss": 0.3778, "num_input_tokens_seen": 62717952, "step": 241 }, { "epoch": 2.726241134751773, "grad_norm": 1.1881992350835984, "learning_rate": 2.1089138373994223e-05, "loss": 0.4034, "num_input_tokens_seen": 62980096, "step": 242 }, { "epoch": 2.7375886524822697, "grad_norm": 1.2274329880141919, "learning_rate": 2.0912937601616005e-05, "loss": 0.3808, "num_input_tokens_seen": 63242240, "step": 243 }, { "epoch": 2.748936170212766, "grad_norm": 1.1187546128405046, "learning_rate": 2.0736945184184405e-05, "loss": 0.3743, "num_input_tokens_seen": 63504384, "step": 244 }, { "epoch": 2.7602836879432626, "grad_norm": 1.2604037547622573, "learning_rate": 2.0561170093642423e-05, "loss": 0.4019, "num_input_tokens_seen": 63766528, "step": 245 }, { "epoch": 2.771631205673759, "grad_norm": 1.1558782623081498, "learning_rate": 2.038562129085387e-05, "loss": 0.3791, "num_input_tokens_seen": 64028672, "step": 246 }, { "epoch": 2.7829787234042556, "grad_norm": 1.1080497501880997, "learning_rate": 2.0210307725146615e-05, "loss": 0.3696, "num_input_tokens_seen": 64290816, "step": 247 }, { "epoch": 2.794326241134752, "grad_norm": 1.3333275378619078, "learning_rate": 2.003523833385637e-05, "loss": 0.3945, "num_input_tokens_seen": 64552960, "step": 248 }, { "epoch": 2.8056737588652485, "grad_norm": 1.1083885717571962, "learning_rate": 1.9860422041870987e-05, "loss": 0.3922, "num_input_tokens_seen": 64815104, "step": 249 }, { "epoch": 2.8170212765957445, "grad_norm": 1.2103467593229682, "learning_rate": 1.9685867761175584e-05, "loss": 0.3921, "num_input_tokens_seen": 65077248, "step": 250 }, { "epoch": 2.828368794326241, "grad_norm": 1.0852218255349009, "learning_rate": 1.9511584390398147e-05, "loss": 0.3846, "num_input_tokens_seen": 65339392, "step": 251 }, { "epoch": 2.8397163120567375, "grad_norm": 1.2112711184032858, "learning_rate": 1.9337580814355888e-05, "loss": 0.3741, "num_input_tokens_seen": 65601536, "step": 252 }, { "epoch": 2.851063829787234, "grad_norm": 1.1548719932949865, "learning_rate": 1.9163865903602374e-05, "loss": 0.3688, "num_input_tokens_seen": 65863680, "step": 253 }, { "epoch": 2.8624113475177304, "grad_norm": 1.0632108357505439, "learning_rate": 1.899044851397519e-05, "loss": 0.3809, "num_input_tokens_seen": 66125824, "step": 254 }, { "epoch": 2.873758865248227, "grad_norm": 1.2738307250224112, "learning_rate": 1.881733748614461e-05, "loss": 0.3861, "num_input_tokens_seen": 66387968, "step": 255 }, { "epoch": 2.8851063829787233, "grad_norm": 1.2665120384000739, "learning_rate": 1.8644541645162834e-05, "loss": 0.3905, "num_input_tokens_seen": 66650112, "step": 256 }, { "epoch": 2.89645390070922, "grad_norm": 1.1562998336132535, "learning_rate": 1.8472069800014068e-05, "loss": 0.3614, "num_input_tokens_seen": 66912256, "step": 257 }, { "epoch": 2.9078014184397163, "grad_norm": 1.1877334097829628, "learning_rate": 1.8299930743165535e-05, "loss": 0.383, "num_input_tokens_seen": 67174400, "step": 258 }, { "epoch": 2.9191489361702128, "grad_norm": 1.077396638449169, "learning_rate": 1.8128133250119157e-05, "loss": 0.3735, "num_input_tokens_seen": 67436544, "step": 259 }, { "epoch": 2.9304964539007092, "grad_norm": 1.1563881856425553, "learning_rate": 1.795668607896426e-05, "loss": 0.3978, "num_input_tokens_seen": 67698688, "step": 260 }, { "epoch": 2.9418439716312057, "grad_norm": 1.0938743557105097, "learning_rate": 1.778559796993104e-05, "loss": 0.3877, "num_input_tokens_seen": 67960832, "step": 261 }, { "epoch": 2.953191489361702, "grad_norm": 1.1062018663854845, "learning_rate": 1.7614877644945e-05, "loss": 0.3862, "num_input_tokens_seen": 68222976, "step": 262 }, { "epoch": 2.9645390070921986, "grad_norm": 1.0442669458478842, "learning_rate": 1.7444533807182357e-05, "loss": 0.3506, "num_input_tokens_seen": 68485120, "step": 263 }, { "epoch": 2.975886524822695, "grad_norm": 1.0755649234512432, "learning_rate": 1.7274575140626318e-05, "loss": 0.371, "num_input_tokens_seen": 68747264, "step": 264 }, { "epoch": 2.9872340425531916, "grad_norm": 1.0856119742333918, "learning_rate": 1.710501030962438e-05, "loss": 0.3866, "num_input_tokens_seen": 69009408, "step": 265 }, { "epoch": 2.998581560283688, "grad_norm": 1.0445462962582197, "learning_rate": 1.6935847958446657e-05, "loss": 0.3701, "num_input_tokens_seen": 69271552, "step": 266 }, { "epoch": 3.0, "grad_norm": 1.0445462962582197, "learning_rate": 1.6767096710845174e-05, "loss": 0.2901, "num_input_tokens_seen": 69304320, "step": 267 }, { "epoch": 3.0113475177304965, "grad_norm": 2.74466686658697, "learning_rate": 1.6598765169614243e-05, "loss": 0.1312, "num_input_tokens_seen": 69566464, "step": 268 }, { "epoch": 3.022695035460993, "grad_norm": 1.5834322773598914, "learning_rate": 1.643086191615194e-05, "loss": 0.1237, "num_input_tokens_seen": 69828608, "step": 269 }, { "epoch": 3.0340425531914894, "grad_norm": 1.303912889019024, "learning_rate": 1.6263395510022543e-05, "loss": 0.1163, "num_input_tokens_seen": 70090752, "step": 270 }, { "epoch": 3.045390070921986, "grad_norm": 0.9262307477869844, "learning_rate": 1.6096374488520265e-05, "loss": 0.101, "num_input_tokens_seen": 70352896, "step": 271 }, { "epoch": 3.0567375886524824, "grad_norm": 0.9926400294079983, "learning_rate": 1.5929807366233977e-05, "loss": 0.1097, "num_input_tokens_seen": 70615040, "step": 272 }, { "epoch": 3.068085106382979, "grad_norm": 1.275115327394668, "learning_rate": 1.5763702634613152e-05, "loss": 0.1049, "num_input_tokens_seen": 70877184, "step": 273 }, { "epoch": 3.0794326241134753, "grad_norm": 1.5930282703149587, "learning_rate": 1.559806876153501e-05, "loss": 0.1168, "num_input_tokens_seen": 71139328, "step": 274 }, { "epoch": 3.0907801418439718, "grad_norm": 1.4943109059271082, "learning_rate": 1.5432914190872757e-05, "loss": 0.1063, "num_input_tokens_seen": 71401472, "step": 275 }, { "epoch": 3.1021276595744682, "grad_norm": 1.3224758273342347, "learning_rate": 1.5268247342065215e-05, "loss": 0.0996, "num_input_tokens_seen": 71663616, "step": 276 }, { "epoch": 3.1134751773049647, "grad_norm": 1.0401363088355635, "learning_rate": 1.5104076609687545e-05, "loss": 0.099, "num_input_tokens_seen": 71925760, "step": 277 }, { "epoch": 3.124822695035461, "grad_norm": 0.9168634985756315, "learning_rate": 1.4940410363023306e-05, "loss": 0.0904, "num_input_tokens_seen": 72187904, "step": 278 }, { "epoch": 3.1361702127659576, "grad_norm": 0.8236578810199936, "learning_rate": 1.4777256945637834e-05, "loss": 0.0986, "num_input_tokens_seen": 72450048, "step": 279 }, { "epoch": 3.147517730496454, "grad_norm": 0.7819456617953726, "learning_rate": 1.4614624674952842e-05, "loss": 0.0922, "num_input_tokens_seen": 72712192, "step": 280 }, { "epoch": 3.1588652482269506, "grad_norm": 0.811675121286178, "learning_rate": 1.4452521841822436e-05, "loss": 0.0981, "num_input_tokens_seen": 72974336, "step": 281 }, { "epoch": 3.1702127659574466, "grad_norm": 0.7976686044001465, "learning_rate": 1.4290956710110475e-05, "loss": 0.0924, "num_input_tokens_seen": 73236480, "step": 282 }, { "epoch": 3.181560283687943, "grad_norm": 0.751816526314869, "learning_rate": 1.4129937516269203e-05, "loss": 0.0947, "num_input_tokens_seen": 73498624, "step": 283 }, { "epoch": 3.1929078014184396, "grad_norm": 0.8177754717456022, "learning_rate": 1.3969472468919461e-05, "loss": 0.0921, "num_input_tokens_seen": 73760768, "step": 284 }, { "epoch": 3.204255319148936, "grad_norm": 0.7571785449155486, "learning_rate": 1.3809569748432189e-05, "loss": 0.0906, "num_input_tokens_seen": 74022912, "step": 285 }, { "epoch": 3.2156028368794325, "grad_norm": 0.747961359332696, "learning_rate": 1.3650237506511331e-05, "loss": 0.0872, "num_input_tokens_seen": 74285056, "step": 286 }, { "epoch": 3.226950354609929, "grad_norm": 0.792288904094197, "learning_rate": 1.3491483865778365e-05, "loss": 0.0884, "num_input_tokens_seen": 74547200, "step": 287 }, { "epoch": 3.2382978723404254, "grad_norm": 0.8907746700880744, "learning_rate": 1.3333316919358157e-05, "loss": 0.0967, "num_input_tokens_seen": 74809344, "step": 288 }, { "epoch": 3.249645390070922, "grad_norm": 0.7775009871286768, "learning_rate": 1.3175744730466408e-05, "loss": 0.0818, "num_input_tokens_seen": 75071488, "step": 289 }, { "epoch": 3.2609929078014184, "grad_norm": 0.7695449735932754, "learning_rate": 1.301877533199859e-05, "loss": 0.0888, "num_input_tokens_seen": 75333632, "step": 290 }, { "epoch": 3.272340425531915, "grad_norm": 0.8110072874482183, "learning_rate": 1.2862416726120396e-05, "loss": 0.0906, "num_input_tokens_seen": 75595776, "step": 291 }, { "epoch": 3.2836879432624113, "grad_norm": 0.7766644179424415, "learning_rate": 1.2706676883859903e-05, "loss": 0.0872, "num_input_tokens_seen": 75857920, "step": 292 }, { "epoch": 3.295035460992908, "grad_norm": 0.7189275332544894, "learning_rate": 1.2551563744701109e-05, "loss": 0.087, "num_input_tokens_seen": 76120064, "step": 293 }, { "epoch": 3.3063829787234043, "grad_norm": 0.711512593628328, "learning_rate": 1.2397085216179208e-05, "loss": 0.0852, "num_input_tokens_seen": 76382208, "step": 294 }, { "epoch": 3.3177304964539007, "grad_norm": 0.7338724262748597, "learning_rate": 1.2243249173477513e-05, "loss": 0.0877, "num_input_tokens_seen": 76644352, "step": 295 }, { "epoch": 3.329078014184397, "grad_norm": 0.7269268914811858, "learning_rate": 1.2090063459025955e-05, "loss": 0.086, "num_input_tokens_seen": 76906496, "step": 296 }, { "epoch": 3.3404255319148937, "grad_norm": 0.7772637508360701, "learning_rate": 1.1937535882101281e-05, "loss": 0.0898, "num_input_tokens_seen": 77168640, "step": 297 }, { "epoch": 3.35177304964539, "grad_norm": 0.7412964391797219, "learning_rate": 1.1785674218428952e-05, "loss": 0.0936, "num_input_tokens_seen": 77430784, "step": 298 }, { "epoch": 3.3631205673758866, "grad_norm": 0.717507466030445, "learning_rate": 1.163448620978674e-05, "loss": 0.0837, "num_input_tokens_seen": 77692928, "step": 299 }, { "epoch": 3.374468085106383, "grad_norm": 0.7491045281676182, "learning_rate": 1.148397956361007e-05, "loss": 0.0902, "num_input_tokens_seen": 77955072, "step": 300 }, { "epoch": 3.3858156028368795, "grad_norm": 0.7816973300591532, "learning_rate": 1.1334161952599054e-05, "loss": 0.085, "num_input_tokens_seen": 78217216, "step": 301 }, { "epoch": 3.397163120567376, "grad_norm": 0.7754872404477443, "learning_rate": 1.1185041014327433e-05, "loss": 0.0967, "num_input_tokens_seen": 78479360, "step": 302 }, { "epoch": 3.4085106382978725, "grad_norm": 0.7562220941293399, "learning_rate": 1.1036624350853145e-05, "loss": 0.0861, "num_input_tokens_seen": 78741504, "step": 303 }, { "epoch": 3.419858156028369, "grad_norm": 0.7446834922212843, "learning_rate": 1.0888919528330777e-05, "loss": 0.0866, "num_input_tokens_seen": 79003648, "step": 304 }, { "epoch": 3.4312056737588654, "grad_norm": 0.7368714630650928, "learning_rate": 1.0741934076625895e-05, "loss": 0.0909, "num_input_tokens_seen": 79265792, "step": 305 }, { "epoch": 3.4425531914893615, "grad_norm": 0.7586993943731002, "learning_rate": 1.059567548893118e-05, "loss": 0.0864, "num_input_tokens_seen": 79527936, "step": 306 }, { "epoch": 3.453900709219858, "grad_norm": 0.7026395743924377, "learning_rate": 1.0450151221384358e-05, "loss": 0.0907, "num_input_tokens_seen": 79790080, "step": 307 }, { "epoch": 3.4652482269503544, "grad_norm": 0.7698549183761029, "learning_rate": 1.0305368692688174e-05, "loss": 0.0875, "num_input_tokens_seen": 80052224, "step": 308 }, { "epoch": 3.476595744680851, "grad_norm": 0.7275021764735461, "learning_rate": 1.016133528373215e-05, "loss": 0.0876, "num_input_tokens_seen": 80314368, "step": 309 }, { "epoch": 3.4879432624113473, "grad_norm": 0.6801616141816872, "learning_rate": 1.0018058337216327e-05, "loss": 0.0757, "num_input_tokens_seen": 80576512, "step": 310 }, { "epoch": 3.499290780141844, "grad_norm": 0.7242637394130332, "learning_rate": 9.875545157276939e-06, "loss": 0.0805, "num_input_tokens_seen": 80838656, "step": 311 }, { "epoch": 3.5106382978723403, "grad_norm": 0.695464243410679, "learning_rate": 9.733803009114045e-06, "loss": 0.0853, "num_input_tokens_seen": 81100800, "step": 312 }, { "epoch": 3.5219858156028367, "grad_norm": 0.7091878866235037, "learning_rate": 9.592839118621187e-06, "loss": 0.0871, "num_input_tokens_seen": 81362944, "step": 313 }, { "epoch": 3.533333333333333, "grad_norm": 0.7211061205609051, "learning_rate": 9.452660672016949e-06, "loss": 0.0787, "num_input_tokens_seen": 81625088, "step": 314 }, { "epoch": 3.5446808510638297, "grad_norm": 0.7222949726436061, "learning_rate": 9.313274815478698e-06, "loss": 0.0862, "num_input_tokens_seen": 81887232, "step": 315 }, { "epoch": 3.556028368794326, "grad_norm": 0.7838259636406755, "learning_rate": 9.174688654778243e-06, "loss": 0.0936, "num_input_tokens_seen": 82149376, "step": 316 }, { "epoch": 3.5673758865248226, "grad_norm": 0.7463416558404256, "learning_rate": 9.036909254919549e-06, "loss": 0.0802, "num_input_tokens_seen": 82411520, "step": 317 }, { "epoch": 3.578723404255319, "grad_norm": 0.7081806282392477, "learning_rate": 8.899943639778619e-06, "loss": 0.0889, "num_input_tokens_seen": 82673664, "step": 318 }, { "epoch": 3.5900709219858156, "grad_norm": 0.725305882992691, "learning_rate": 8.763798791745411e-06, "loss": 0.0787, "num_input_tokens_seen": 82935808, "step": 319 }, { "epoch": 3.601418439716312, "grad_norm": 0.6909525153125249, "learning_rate": 8.628481651367876e-06, "loss": 0.084, "num_input_tokens_seen": 83197952, "step": 320 }, { "epoch": 3.6127659574468085, "grad_norm": 0.8073887823074956, "learning_rate": 8.49399911699814e-06, "loss": 0.0897, "num_input_tokens_seen": 83460096, "step": 321 }, { "epoch": 3.624113475177305, "grad_norm": 0.7270215654465839, "learning_rate": 8.360358044440797e-06, "loss": 0.0876, "num_input_tokens_seen": 83722240, "step": 322 }, { "epoch": 3.6354609929078014, "grad_norm": 0.7664259700341645, "learning_rate": 8.227565246603493e-06, "loss": 0.0815, "num_input_tokens_seen": 83984384, "step": 323 }, { "epoch": 3.646808510638298, "grad_norm": 0.6912127705169636, "learning_rate": 8.09562749314952e-06, "loss": 0.0855, "num_input_tokens_seen": 84246528, "step": 324 }, { "epoch": 3.6581560283687944, "grad_norm": 0.7391846111163392, "learning_rate": 7.96455151015272e-06, "loss": 0.086, "num_input_tokens_seen": 84508672, "step": 325 }, { "epoch": 3.669503546099291, "grad_norm": 0.7040432780957079, "learning_rate": 7.83434397975466e-06, "loss": 0.0869, "num_input_tokens_seen": 84770816, "step": 326 }, { "epoch": 3.6808510638297873, "grad_norm": 0.7049978471337497, "learning_rate": 7.705011539823911e-06, "loss": 0.0791, "num_input_tokens_seen": 85032960, "step": 327 }, { "epoch": 3.692198581560284, "grad_norm": 0.6773004765686385, "learning_rate": 7.576560783617668e-06, "loss": 0.0807, "num_input_tokens_seen": 85295104, "step": 328 }, { "epoch": 3.7035460992907803, "grad_norm": 0.7205733650917361, "learning_rate": 7.448998259445664e-06, "loss": 0.0846, "num_input_tokens_seen": 85557248, "step": 329 }, { "epoch": 3.7148936170212767, "grad_norm": 0.7090150829436532, "learning_rate": 7.3223304703363135e-06, "loss": 0.0755, "num_input_tokens_seen": 85819392, "step": 330 }, { "epoch": 3.726241134751773, "grad_norm": 0.7317047711785631, "learning_rate": 7.196563873705209e-06, "loss": 0.0897, "num_input_tokens_seen": 86081536, "step": 331 }, { "epoch": 3.7375886524822697, "grad_norm": 0.7593361249228872, "learning_rate": 7.071704881025915e-06, "loss": 0.0807, "num_input_tokens_seen": 86343680, "step": 332 }, { "epoch": 3.748936170212766, "grad_norm": 0.7235635799774105, "learning_rate": 6.947759857503119e-06, "loss": 0.0839, "num_input_tokens_seen": 86605824, "step": 333 }, { "epoch": 3.7602836879432626, "grad_norm": 0.7456576507189918, "learning_rate": 6.824735121748163e-06, "loss": 0.0811, "num_input_tokens_seen": 86867968, "step": 334 }, { "epoch": 3.771631205673759, "grad_norm": 0.693061575666079, "learning_rate": 6.70263694545687e-06, "loss": 0.0759, "num_input_tokens_seen": 87130112, "step": 335 }, { "epoch": 3.7829787234042556, "grad_norm": 0.6702503675155104, "learning_rate": 6.5814715530898745e-06, "loss": 0.0764, "num_input_tokens_seen": 87392256, "step": 336 }, { "epoch": 3.794326241134752, "grad_norm": 0.7321643097852675, "learning_rate": 6.461245121555307e-06, "loss": 0.0855, "num_input_tokens_seen": 87654400, "step": 337 }, { "epoch": 3.8056737588652485, "grad_norm": 0.7231929761648604, "learning_rate": 6.341963779893828e-06, "loss": 0.0789, "num_input_tokens_seen": 87916544, "step": 338 }, { "epoch": 3.8170212765957445, "grad_norm": 0.6959508065425039, "learning_rate": 6.223633608966254e-06, "loss": 0.0804, "num_input_tokens_seen": 88178688, "step": 339 }, { "epoch": 3.828368794326241, "grad_norm": 0.6689396318017862, "learning_rate": 6.106260641143546e-06, "loss": 0.0846, "num_input_tokens_seen": 88440832, "step": 340 }, { "epoch": 3.8397163120567375, "grad_norm": 0.7000335984216709, "learning_rate": 5.989850859999227e-06, "loss": 0.0768, "num_input_tokens_seen": 88702976, "step": 341 }, { "epoch": 3.851063829787234, "grad_norm": 0.6819366248376687, "learning_rate": 5.874410200004421e-06, "loss": 0.073, "num_input_tokens_seen": 88965120, "step": 342 }, { "epoch": 3.8624113475177304, "grad_norm": 0.6970883337622755, "learning_rate": 5.759944546225271e-06, "loss": 0.0826, "num_input_tokens_seen": 89227264, "step": 343 }, { "epoch": 3.873758865248227, "grad_norm": 0.6874281419558664, "learning_rate": 5.646459734022938e-06, "loss": 0.0697, "num_input_tokens_seen": 89489408, "step": 344 }, { "epoch": 3.8851063829787233, "grad_norm": 0.7106888911769208, "learning_rate": 5.533961548756128e-06, "loss": 0.0812, "num_input_tokens_seen": 89751552, "step": 345 }, { "epoch": 3.89645390070922, "grad_norm": 0.7331620079344322, "learning_rate": 5.422455725486114e-06, "loss": 0.0836, "num_input_tokens_seen": 90013696, "step": 346 }, { "epoch": 3.9078014184397163, "grad_norm": 0.7035598390060888, "learning_rate": 5.311947948684457e-06, "loss": 0.0815, "num_input_tokens_seen": 90275840, "step": 347 }, { "epoch": 3.9191489361702128, "grad_norm": 0.7060132556751886, "learning_rate": 5.202443851943126e-06, "loss": 0.0806, "num_input_tokens_seen": 90537984, "step": 348 }, { "epoch": 3.9304964539007092, "grad_norm": 0.6788669783816418, "learning_rate": 5.093949017687341e-06, "loss": 0.0794, "num_input_tokens_seen": 90800128, "step": 349 }, { "epoch": 3.9418439716312057, "grad_norm": 0.6687951756948881, "learning_rate": 4.986468976890993e-06, "loss": 0.0734, "num_input_tokens_seen": 91062272, "step": 350 }, { "epoch": 3.953191489361702, "grad_norm": 0.691944078100738, "learning_rate": 4.880009208794667e-06, "loss": 0.0814, "num_input_tokens_seen": 91324416, "step": 351 }, { "epoch": 3.9645390070921986, "grad_norm": 0.6500781762399683, "learning_rate": 4.7745751406263165e-06, "loss": 0.0726, "num_input_tokens_seen": 91586560, "step": 352 }, { "epoch": 3.975886524822695, "grad_norm": 0.671905484958583, "learning_rate": 4.670172147324592e-06, "loss": 0.077, "num_input_tokens_seen": 91848704, "step": 353 }, { "epoch": 3.9872340425531916, "grad_norm": 0.6796917288094408, "learning_rate": 4.566805551264827e-06, "loss": 0.0796, "num_input_tokens_seen": 92110848, "step": 354 }, { "epoch": 3.998581560283688, "grad_norm": 0.6833947734522067, "learning_rate": 4.4644806219877184e-06, "loss": 0.0723, "num_input_tokens_seen": 92372992, "step": 355 }, { "epoch": 4.0, "grad_norm": 0.6833947734522067, "learning_rate": 4.36320257593065e-06, "loss": 0.045, "num_input_tokens_seen": 92405760, "step": 356 }, { "epoch": 4.0113475177304965, "grad_norm": 1.477142079845877, "learning_rate": 4.262976576161842e-06, "loss": 0.0244, "num_input_tokens_seen": 92667904, "step": 357 }, { "epoch": 4.022695035460993, "grad_norm": 0.5476807803856774, "learning_rate": 4.1638077321170646e-06, "loss": 0.0227, "num_input_tokens_seen": 92930048, "step": 358 }, { "epoch": 4.034042553191489, "grad_norm": 0.48697982739940027, "learning_rate": 4.0657010993391865e-06, "loss": 0.0213, "num_input_tokens_seen": 93192192, "step": 359 }, { "epoch": 4.045390070921986, "grad_norm": 0.44199476911249697, "learning_rate": 3.968661679220468e-06, "loss": 0.0196, "num_input_tokens_seen": 93454336, "step": 360 }, { "epoch": 4.056737588652482, "grad_norm": 0.40308653677545786, "learning_rate": 3.872694418747594e-06, "loss": 0.018, "num_input_tokens_seen": 93716480, "step": 361 }, { "epoch": 4.068085106382979, "grad_norm": 0.37702562689636665, "learning_rate": 3.777804210249436e-06, "loss": 0.0176, "num_input_tokens_seen": 93978624, "step": 362 }, { "epoch": 4.079432624113475, "grad_norm": 0.36421639955022134, "learning_rate": 3.6839958911476957e-06, "loss": 0.0193, "num_input_tokens_seen": 94240768, "step": 363 }, { "epoch": 4.090780141843972, "grad_norm": 0.3805365338343004, "learning_rate": 3.591274243710277e-06, "loss": 0.0176, "num_input_tokens_seen": 94502912, "step": 364 }, { "epoch": 4.102127659574468, "grad_norm": 0.3382957037488846, "learning_rate": 3.499643994807486e-06, "loss": 0.0163, "num_input_tokens_seen": 94765056, "step": 365 }, { "epoch": 4.113475177304965, "grad_norm": 0.32506441384380935, "learning_rate": 3.4091098156710744e-06, "loss": 0.0159, "num_input_tokens_seen": 95027200, "step": 366 }, { "epoch": 4.124822695035461, "grad_norm": 0.3165720865193881, "learning_rate": 3.319676321656082e-06, "loss": 0.0164, "num_input_tokens_seen": 95289344, "step": 367 }, { "epoch": 4.136170212765958, "grad_norm": 0.35920729686783553, "learning_rate": 3.2313480720055745e-06, "loss": 0.0159, "num_input_tokens_seen": 95551488, "step": 368 }, { "epoch": 4.147517730496454, "grad_norm": 0.35575880232588847, "learning_rate": 3.1441295696181897e-06, "loss": 0.0161, "num_input_tokens_seen": 95813632, "step": 369 }, { "epoch": 4.158865248226951, "grad_norm": 0.35269152277096033, "learning_rate": 3.058025260818609e-06, "loss": 0.0149, "num_input_tokens_seen": 96075776, "step": 370 }, { "epoch": 4.170212765957447, "grad_norm": 0.38681949998462883, "learning_rate": 2.9730395351308866e-06, "loss": 0.0147, "num_input_tokens_seen": 96337920, "step": 371 }, { "epoch": 4.1815602836879435, "grad_norm": 0.34874854085333057, "learning_rate": 2.889176725054643e-06, "loss": 0.0164, "num_input_tokens_seen": 96600064, "step": 372 }, { "epoch": 4.19290780141844, "grad_norm": 0.42993916416930195, "learning_rate": 2.80644110584424e-06, "loss": 0.0149, "num_input_tokens_seen": 96862208, "step": 373 }, { "epoch": 4.2042553191489365, "grad_norm": 0.3819874015881562, "learning_rate": 2.7248368952908053e-06, "loss": 0.0162, "num_input_tokens_seen": 97124352, "step": 374 }, { "epoch": 4.215602836879433, "grad_norm": 0.39897276144640115, "learning_rate": 2.6443682535072177e-06, "loss": 0.0144, "num_input_tokens_seen": 97386496, "step": 375 }, { "epoch": 4.226950354609929, "grad_norm": 0.3570885820962534, "learning_rate": 2.565039282716045e-06, "loss": 0.0143, "num_input_tokens_seen": 97648640, "step": 376 }, { "epoch": 4.238297872340426, "grad_norm": 0.36959919713875067, "learning_rate": 2.486854027040375e-06, "loss": 0.0147, "num_input_tokens_seen": 97910784, "step": 377 }, { "epoch": 4.249645390070922, "grad_norm": 0.3387567156073276, "learning_rate": 2.4098164722977073e-06, "loss": 0.0142, "num_input_tokens_seen": 98172928, "step": 378 }, { "epoch": 4.260992907801419, "grad_norm": 0.35273580349206224, "learning_rate": 2.333930545796717e-06, "loss": 0.0162, "num_input_tokens_seen": 98435072, "step": 379 }, { "epoch": 4.272340425531915, "grad_norm": 0.41537674406100333, "learning_rate": 2.2592001161370392e-06, "loss": 0.0163, "num_input_tokens_seen": 98697216, "step": 380 }, { "epoch": 4.283687943262412, "grad_norm": 0.3722991986523798, "learning_rate": 2.185628993012101e-06, "loss": 0.0146, "num_input_tokens_seen": 98959360, "step": 381 }, { "epoch": 4.295035460992908, "grad_norm": 0.34823793557474586, "learning_rate": 2.11322092701485e-06, "loss": 0.0155, "num_input_tokens_seen": 99221504, "step": 382 }, { "epoch": 4.306382978723404, "grad_norm": 0.36284584139761145, "learning_rate": 2.0419796094465788e-06, "loss": 0.014, "num_input_tokens_seen": 99483648, "step": 383 }, { "epoch": 4.317730496453901, "grad_norm": 0.33330741537946007, "learning_rate": 1.97190867212875e-06, "loss": 0.0158, "num_input_tokens_seen": 99745792, "step": 384 }, { "epoch": 4.329078014184397, "grad_norm": 0.3678153199936105, "learning_rate": 1.9030116872178316e-06, "loss": 0.0145, "num_input_tokens_seen": 100007936, "step": 385 }, { "epoch": 4.340425531914893, "grad_norm": 0.3179402575720803, "learning_rate": 1.8352921670232143e-06, "loss": 0.0142, "num_input_tokens_seen": 100270080, "step": 386 }, { "epoch": 4.35177304964539, "grad_norm": 0.3139410780519724, "learning_rate": 1.768753563828135e-06, "loss": 0.0145, "num_input_tokens_seen": 100532224, "step": 387 }, { "epoch": 4.363120567375886, "grad_norm": 0.3543527103722898, "learning_rate": 1.703399269713693e-06, "loss": 0.0139, "num_input_tokens_seen": 100794368, "step": 388 }, { "epoch": 4.374468085106383, "grad_norm": 0.335580074515755, "learning_rate": 1.6392326163859273e-06, "loss": 0.0165, "num_input_tokens_seen": 101056512, "step": 389 }, { "epoch": 4.385815602836879, "grad_norm": 0.34683584517312804, "learning_rate": 1.5762568750059604e-06, "loss": 0.0152, "num_input_tokens_seen": 101318656, "step": 390 }, { "epoch": 4.397163120567376, "grad_norm": 0.3626865573265347, "learning_rate": 1.5144752560232372e-06, "loss": 0.016, "num_input_tokens_seen": 101580800, "step": 391 }, { "epoch": 4.408510638297872, "grad_norm": 0.3038959732006494, "learning_rate": 1.4538909090118846e-06, "loss": 0.0135, "num_input_tokens_seen": 101842944, "step": 392 }, { "epoch": 4.4198581560283685, "grad_norm": 0.33499076540774964, "learning_rate": 1.3945069225101026e-06, "loss": 0.0152, "num_input_tokens_seen": 102105088, "step": 393 }, { "epoch": 4.431205673758865, "grad_norm": 0.3607028863228608, "learning_rate": 1.3363263238627493e-06, "loss": 0.0161, "num_input_tokens_seen": 102367232, "step": 394 }, { "epoch": 4.4425531914893615, "grad_norm": 0.33049646901291296, "learning_rate": 1.2793520790670116e-06, "loss": 0.0153, "num_input_tokens_seen": 102629376, "step": 395 }, { "epoch": 4.453900709219858, "grad_norm": 0.3409835608306395, "learning_rate": 1.2235870926211619e-06, "loss": 0.0147, "num_input_tokens_seen": 102891520, "step": 396 }, { "epoch": 4.465248226950354, "grad_norm": 0.3426495926722642, "learning_rate": 1.1690342073765375e-06, "loss": 0.0148, "num_input_tokens_seen": 103153664, "step": 397 }, { "epoch": 4.476595744680851, "grad_norm": 0.3385258820673031, "learning_rate": 1.1156962043925828e-06, "loss": 0.0158, "num_input_tokens_seen": 103415808, "step": 398 }, { "epoch": 4.487943262411347, "grad_norm": 0.3366041472428315, "learning_rate": 1.0635758027950888e-06, "loss": 0.0135, "num_input_tokens_seen": 103677952, "step": 399 }, { "epoch": 4.499290780141844, "grad_norm": 0.3580533391262604, "learning_rate": 1.0126756596375686e-06, "loss": 0.0153, "num_input_tokens_seen": 103940096, "step": 400 }, { "epoch": 4.51063829787234, "grad_norm": 0.32986746041795145, "learning_rate": 9.629983697657886e-07, "loss": 0.0148, "num_input_tokens_seen": 104202240, "step": 401 }, { "epoch": 4.521985815602837, "grad_norm": 0.32413763896921793, "learning_rate": 9.145464656855257e-07, "loss": 0.0137, "num_input_tokens_seen": 104464384, "step": 402 }, { "epoch": 4.533333333333333, "grad_norm": 0.2817637969270016, "learning_rate": 8.673224174334221e-07, "loss": 0.0134, "num_input_tokens_seen": 104726528, "step": 403 }, { "epoch": 4.54468085106383, "grad_norm": 0.348191153365174, "learning_rate": 8.213286324510738e-07, "loss": 0.0147, "num_input_tokens_seen": 104988672, "step": 404 }, { "epoch": 4.556028368794326, "grad_norm": 0.3602776848534372, "learning_rate": 7.765674554623181e-07, "loss": 0.0168, "num_input_tokens_seen": 105250816, "step": 405 }, { "epoch": 4.567375886524823, "grad_norm": 0.31058750754194875, "learning_rate": 7.330411683536876e-07, "loss": 0.0147, "num_input_tokens_seen": 105512960, "step": 406 }, { "epoch": 4.578723404255319, "grad_norm": 0.3075632948113939, "learning_rate": 6.907519900580861e-07, "loss": 0.0155, "num_input_tokens_seen": 105775104, "step": 407 }, { "epoch": 4.590070921985816, "grad_norm": 0.3413483582876884, "learning_rate": 6.497020764416633e-07, "loss": 0.0153, "num_input_tokens_seen": 106037248, "step": 408 }, { "epoch": 4.601418439716312, "grad_norm": 0.3299810205766221, "learning_rate": 6.098935201939187e-07, "loss": 0.0139, "num_input_tokens_seen": 106299392, "step": 409 }, { "epoch": 4.6127659574468085, "grad_norm": 0.3045488270246865, "learning_rate": 5.713283507210148e-07, "loss": 0.0137, "num_input_tokens_seen": 106561536, "step": 410 }, { "epoch": 4.624113475177305, "grad_norm": 0.30919394459980154, "learning_rate": 5.340085340423129e-07, "loss": 0.0136, "num_input_tokens_seen": 106823680, "step": 411 }, { "epoch": 4.6354609929078014, "grad_norm": 0.3317176085043783, "learning_rate": 4.979359726901639e-07, "loss": 0.0145, "num_input_tokens_seen": 107085824, "step": 412 }, { "epoch": 4.646808510638298, "grad_norm": 0.3157192290221517, "learning_rate": 4.63112505612906e-07, "loss": 0.0137, "num_input_tokens_seen": 107347968, "step": 413 }, { "epoch": 4.658156028368794, "grad_norm": 0.304631562860152, "learning_rate": 4.2953990808111135e-07, "loss": 0.015, "num_input_tokens_seen": 107610112, "step": 414 }, { "epoch": 4.669503546099291, "grad_norm": 0.3526905412516224, "learning_rate": 3.972198915970976e-07, "loss": 0.0152, "num_input_tokens_seen": 107872256, "step": 415 }, { "epoch": 4.680851063829787, "grad_norm": 0.33679224274071523, "learning_rate": 3.6615410380767544e-07, "loss": 0.0147, "num_input_tokens_seen": 108134400, "step": 416 }, { "epoch": 4.692198581560284, "grad_norm": 0.32343102481646735, "learning_rate": 3.3634412842014353e-07, "loss": 0.0139, "num_input_tokens_seen": 108396544, "step": 417 }, { "epoch": 4.70354609929078, "grad_norm": 0.2946484230167598, "learning_rate": 3.077914851215585e-07, "loss": 0.0142, "num_input_tokens_seen": 108658688, "step": 418 }, { "epoch": 4.714893617021277, "grad_norm": 0.30378846194550047, "learning_rate": 2.804976295012612e-07, "loss": 0.0139, "num_input_tokens_seen": 108920832, "step": 419 }, { "epoch": 4.726241134751773, "grad_norm": 0.33144732440436586, "learning_rate": 2.544639529766829e-07, "loss": 0.0136, "num_input_tokens_seen": 109182976, "step": 420 }, { "epoch": 4.73758865248227, "grad_norm": 0.3360054980469322, "learning_rate": 2.2969178272238545e-07, "loss": 0.0143, "num_input_tokens_seen": 109445120, "step": 421 }, { "epoch": 4.748936170212766, "grad_norm": 0.3119829984354626, "learning_rate": 2.061823816024322e-07, "loss": 0.0132, "num_input_tokens_seen": 109707264, "step": 422 }, { "epoch": 4.760283687943263, "grad_norm": 0.3182968779470437, "learning_rate": 1.8393694810599493e-07, "loss": 0.0153, "num_input_tokens_seen": 109969408, "step": 423 }, { "epoch": 4.771631205673759, "grad_norm": 0.2823020931727126, "learning_rate": 1.6295661628624447e-07, "loss": 0.0139, "num_input_tokens_seen": 110231552, "step": 424 }, { "epoch": 4.782978723404256, "grad_norm": 0.33810487516657384, "learning_rate": 1.4324245570256633e-07, "loss": 0.0167, "num_input_tokens_seen": 110493696, "step": 425 }, { "epoch": 4.794326241134752, "grad_norm": 0.33893844406644097, "learning_rate": 1.2479547136600989e-07, "loss": 0.0135, "num_input_tokens_seen": 110755840, "step": 426 }, { "epoch": 4.8056737588652485, "grad_norm": 0.3356203471174735, "learning_rate": 1.0761660368806548e-07, "loss": 0.0151, "num_input_tokens_seen": 111017984, "step": 427 }, { "epoch": 4.817021276595745, "grad_norm": 0.3334523590081176, "learning_rate": 9.170672843271666e-08, "loss": 0.0143, "num_input_tokens_seen": 111280128, "step": 428 }, { "epoch": 4.828368794326241, "grad_norm": 0.32060335064770346, "learning_rate": 7.706665667180091e-08, "loss": 0.013, "num_input_tokens_seen": 111542272, "step": 429 }, { "epoch": 4.839716312056738, "grad_norm": 0.30209084334682806, "learning_rate": 6.369713474366212e-08, "loss": 0.0141, "num_input_tokens_seen": 111804416, "step": 430 }, { "epoch": 4.851063829787234, "grad_norm": 0.35524870224084554, "learning_rate": 5.159884421509498e-08, "loss": 0.0144, "num_input_tokens_seen": 112066560, "step": 431 }, { "epoch": 4.862411347517731, "grad_norm": 0.3137882300233803, "learning_rate": 4.07724018466088e-08, "loss": 0.0162, "num_input_tokens_seen": 112328704, "step": 432 }, { "epoch": 4.873758865248227, "grad_norm": 0.31600464447069954, "learning_rate": 3.1218359560974966e-08, "loss": 0.0145, "num_input_tokens_seen": 112590848, "step": 433 }, { "epoch": 4.885106382978723, "grad_norm": 0.36130384891727935, "learning_rate": 2.2937204415107717e-08, "loss": 0.0149, "num_input_tokens_seen": 112852992, "step": 434 }, { "epoch": 4.89645390070922, "grad_norm": 0.28210880961362844, "learning_rate": 1.5929358575206275e-08, "loss": 0.014, "num_input_tokens_seen": 113115136, "step": 435 }, { "epoch": 4.907801418439716, "grad_norm": 0.3370034082905995, "learning_rate": 1.0195179295269252e-08, "loss": 0.0145, "num_input_tokens_seen": 113377280, "step": 436 }, { "epoch": 4.919148936170213, "grad_norm": 0.34659397478326986, "learning_rate": 5.7349588988481194e-09, "loss": 0.0131, "num_input_tokens_seen": 113639424, "step": 437 }, { "epoch": 4.930496453900709, "grad_norm": 0.3388609291955605, "learning_rate": 2.5489247641674596e-09, "loss": 0.0133, "num_input_tokens_seen": 113901568, "step": 438 }, { "epoch": 4.941843971631206, "grad_norm": 0.32782899822451794, "learning_rate": 6.372393125203546e-10, "loss": 0.0147, "num_input_tokens_seen": 114163712, "step": 439 }, { "epoch": 4.953191489361702, "grad_norm": 0.340643949367228, "learning_rate": 0.0, "loss": 0.0166, "num_input_tokens_seen": 114425856, "step": 440 }, { "epoch": 4.953191489361702, "num_input_tokens_seen": 114425856, "step": 440, "total_flos": 182736094494720.0, "train_loss": 0.6679792516632006, "train_runtime": 10942.5374, "train_samples_per_second": 10.298, "train_steps_per_second": 0.04 } ], "logging_steps": 1, "max_steps": 440, "num_input_tokens_seen": 114425856, "num_train_epochs": 5, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 182736094494720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }