{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9994747767801315, "eval_steps": 500, "global_step": 14991, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010004251807017982, "grad_norm": 24.82086753845215, "learning_rate": 4.9999986275753206e-05, "loss": 6.6438, "num_input_tokens_seen": 16320, "step": 5 }, { "epoch": 0.0020008503614035965, "grad_norm": 19.489648818969727, "learning_rate": 4.99999451030279e-05, "loss": 2.22, "num_input_tokens_seen": 33088, "step": 10 }, { "epoch": 0.0030012755421053947, "grad_norm": 7.6068010330200195, "learning_rate": 4.999987648186929e-05, "loss": 1.1072, "num_input_tokens_seen": 49248, "step": 15 }, { "epoch": 0.004001700722807193, "grad_norm": 12.884929656982422, "learning_rate": 4.99997804123527e-05, "loss": 0.6484, "num_input_tokens_seen": 66496, "step": 20 }, { "epoch": 0.005002125903508991, "grad_norm": 4.620934009552002, "learning_rate": 4.999965689458363e-05, "loss": 0.3944, "num_input_tokens_seen": 83328, "step": 25 }, { "epoch": 0.0060025510842107895, "grad_norm": 5.337153434753418, "learning_rate": 4.9999505928697686e-05, "loss": 0.3009, "num_input_tokens_seen": 100480, "step": 30 }, { "epoch": 0.007002976264912588, "grad_norm": 3.612947702407837, "learning_rate": 4.9999327514860616e-05, "loss": 0.1965, "num_input_tokens_seen": 117856, "step": 35 }, { "epoch": 0.008003401445614386, "grad_norm": 3.843050956726074, "learning_rate": 4.999912165326831e-05, "loss": 0.1579, "num_input_tokens_seen": 134656, "step": 40 }, { "epoch": 0.009003826626316184, "grad_norm": 5.87134313583374, "learning_rate": 4.999888834414679e-05, "loss": 0.148, "num_input_tokens_seen": 151168, "step": 45 }, { "epoch": 0.010004251807017982, "grad_norm": 2.828153610229492, "learning_rate": 4.9998627587752225e-05, "loss": 0.1221, "num_input_tokens_seen": 168800, "step": 50 }, { "epoch": 0.01100467698771978, "grad_norm": 3.484650135040283, "learning_rate": 4.99983393843709e-05, "loss": 0.1377, "num_input_tokens_seen": 187296, "step": 55 }, { "epoch": 0.012005102168421579, "grad_norm": 1.3379253149032593, "learning_rate": 4.999802373431924e-05, "loss": 0.128, "num_input_tokens_seen": 203008, "step": 60 }, { "epoch": 0.013005527349123377, "grad_norm": 2.5667777061462402, "learning_rate": 4.999768063794383e-05, "loss": 0.0907, "num_input_tokens_seen": 219168, "step": 65 }, { "epoch": 0.014005952529825175, "grad_norm": 3.9828338623046875, "learning_rate": 4.999731009562135e-05, "loss": 0.1304, "num_input_tokens_seen": 235040, "step": 70 }, { "epoch": 0.015006377710526974, "grad_norm": 2.727963924407959, "learning_rate": 4.999691210775864e-05, "loss": 0.1006, "num_input_tokens_seen": 251616, "step": 75 }, { "epoch": 0.016006802891228772, "grad_norm": 4.61973762512207, "learning_rate": 4.999648667479266e-05, "loss": 0.1582, "num_input_tokens_seen": 268608, "step": 80 }, { "epoch": 0.017007228071930572, "grad_norm": 3.472965717315674, "learning_rate": 4.9996033797190524e-05, "loss": 0.1014, "num_input_tokens_seen": 285952, "step": 85 }, { "epoch": 0.01800765325263237, "grad_norm": 1.9901703596115112, "learning_rate": 4.999555347544946e-05, "loss": 0.1747, "num_input_tokens_seen": 302528, "step": 90 }, { "epoch": 0.01900807843333417, "grad_norm": 2.2979788780212402, "learning_rate": 4.999504571009682e-05, "loss": 0.0997, "num_input_tokens_seen": 317728, "step": 95 }, { "epoch": 0.020008503614035965, "grad_norm": 1.4213311672210693, "learning_rate": 4.999451050169011e-05, "loss": 0.1037, "num_input_tokens_seen": 333152, "step": 100 }, { "epoch": 0.021008928794737765, "grad_norm": 1.5144597291946411, "learning_rate": 4.999394785081696e-05, "loss": 0.1403, "num_input_tokens_seen": 350592, "step": 105 }, { "epoch": 0.02200935397543956, "grad_norm": 1.9958913326263428, "learning_rate": 4.9993357758095115e-05, "loss": 0.0654, "num_input_tokens_seen": 366624, "step": 110 }, { "epoch": 0.02300977915614136, "grad_norm": 2.1754813194274902, "learning_rate": 4.9992740224172476e-05, "loss": 0.081, "num_input_tokens_seen": 383296, "step": 115 }, { "epoch": 0.024010204336843158, "grad_norm": 1.4228804111480713, "learning_rate": 4.999209524972704e-05, "loss": 0.068, "num_input_tokens_seen": 399008, "step": 120 }, { "epoch": 0.025010629517544958, "grad_norm": 1.2373309135437012, "learning_rate": 4.9991422835466974e-05, "loss": 0.0602, "num_input_tokens_seen": 415552, "step": 125 }, { "epoch": 0.026011054698246754, "grad_norm": 1.2388474941253662, "learning_rate": 4.999072298213053e-05, "loss": 0.071, "num_input_tokens_seen": 431808, "step": 130 }, { "epoch": 0.027011479878948554, "grad_norm": 0.6230462193489075, "learning_rate": 4.99899956904861e-05, "loss": 0.1395, "num_input_tokens_seen": 448416, "step": 135 }, { "epoch": 0.02801190505965035, "grad_norm": 1.511096477508545, "learning_rate": 4.9989240961332226e-05, "loss": 0.0745, "num_input_tokens_seen": 464512, "step": 140 }, { "epoch": 0.02901233024035215, "grad_norm": 1.797107219696045, "learning_rate": 4.998845879549754e-05, "loss": 0.0943, "num_input_tokens_seen": 480448, "step": 145 }, { "epoch": 0.030012755421053947, "grad_norm": 1.3896641731262207, "learning_rate": 4.998764919384082e-05, "loss": 0.1391, "num_input_tokens_seen": 497600, "step": 150 }, { "epoch": 0.031013180601755747, "grad_norm": 1.2604399919509888, "learning_rate": 4.9986812157250955e-05, "loss": 0.0573, "num_input_tokens_seen": 513120, "step": 155 }, { "epoch": 0.032013605782457544, "grad_norm": 2.3319883346557617, "learning_rate": 4.998594768664697e-05, "loss": 0.0862, "num_input_tokens_seen": 531648, "step": 160 }, { "epoch": 0.033014030963159344, "grad_norm": 1.6746535301208496, "learning_rate": 4.9985055782977985e-05, "loss": 0.1011, "num_input_tokens_seen": 547520, "step": 165 }, { "epoch": 0.034014456143861144, "grad_norm": 3.3541808128356934, "learning_rate": 4.998413644722327e-05, "loss": 0.0811, "num_input_tokens_seen": 563328, "step": 170 }, { "epoch": 0.03501488132456294, "grad_norm": 3.3083388805389404, "learning_rate": 4.998318968039221e-05, "loss": 0.0714, "num_input_tokens_seen": 579872, "step": 175 }, { "epoch": 0.03601530650526474, "grad_norm": 1.5279896259307861, "learning_rate": 4.9982215483524264e-05, "loss": 0.0648, "num_input_tokens_seen": 595808, "step": 180 }, { "epoch": 0.03701573168596654, "grad_norm": 0.45309773087501526, "learning_rate": 4.998121385768907e-05, "loss": 0.0563, "num_input_tokens_seen": 612288, "step": 185 }, { "epoch": 0.03801615686666834, "grad_norm": 1.0328619480133057, "learning_rate": 4.998018480398635e-05, "loss": 0.0553, "num_input_tokens_seen": 628448, "step": 190 }, { "epoch": 0.03901658204737013, "grad_norm": 0.4076535701751709, "learning_rate": 4.997912832354593e-05, "loss": 0.0411, "num_input_tokens_seen": 645440, "step": 195 }, { "epoch": 0.04001700722807193, "grad_norm": 2.606926679611206, "learning_rate": 4.997804441752778e-05, "loss": 0.1207, "num_input_tokens_seen": 661056, "step": 200 }, { "epoch": 0.04101743240877373, "grad_norm": 3.2455272674560547, "learning_rate": 4.997693308712195e-05, "loss": 0.0781, "num_input_tokens_seen": 677312, "step": 205 }, { "epoch": 0.04201785758947553, "grad_norm": 2.260077476501465, "learning_rate": 4.997579433354861e-05, "loss": 0.0725, "num_input_tokens_seen": 694560, "step": 210 }, { "epoch": 0.04301828277017732, "grad_norm": 1.78510320186615, "learning_rate": 4.997462815805806e-05, "loss": 0.0848, "num_input_tokens_seen": 711968, "step": 215 }, { "epoch": 0.04401870795087912, "grad_norm": 1.5576282739639282, "learning_rate": 4.9973434561930674e-05, "loss": 0.0829, "num_input_tokens_seen": 729664, "step": 220 }, { "epoch": 0.04501913313158092, "grad_norm": 0.7414575815200806, "learning_rate": 4.997221354647695e-05, "loss": 0.0526, "num_input_tokens_seen": 746496, "step": 225 }, { "epoch": 0.04601955831228272, "grad_norm": 1.7496589422225952, "learning_rate": 4.9970965113037506e-05, "loss": 0.0671, "num_input_tokens_seen": 763200, "step": 230 }, { "epoch": 0.047019983492984516, "grad_norm": 1.5983532667160034, "learning_rate": 4.9969689262983036e-05, "loss": 0.0721, "num_input_tokens_seen": 779552, "step": 235 }, { "epoch": 0.048020408673686316, "grad_norm": 1.5230525732040405, "learning_rate": 4.996838599771433e-05, "loss": 0.0822, "num_input_tokens_seen": 796096, "step": 240 }, { "epoch": 0.049020833854388116, "grad_norm": 2.191336154937744, "learning_rate": 4.996705531866232e-05, "loss": 0.0525, "num_input_tokens_seen": 811904, "step": 245 }, { "epoch": 0.050021259035089916, "grad_norm": 0.9807636737823486, "learning_rate": 4.9965697227287997e-05, "loss": 0.0981, "num_input_tokens_seen": 828800, "step": 250 }, { "epoch": 0.05102168421579171, "grad_norm": 0.8154246211051941, "learning_rate": 4.996431172508248e-05, "loss": 0.0715, "num_input_tokens_seen": 846656, "step": 255 }, { "epoch": 0.05202210939649351, "grad_norm": 0.8523686528205872, "learning_rate": 4.996289881356695e-05, "loss": 0.0787, "num_input_tokens_seen": 862240, "step": 260 }, { "epoch": 0.05302253457719531, "grad_norm": 1.0898770093917847, "learning_rate": 4.99614584942927e-05, "loss": 0.072, "num_input_tokens_seen": 878528, "step": 265 }, { "epoch": 0.05402295975789711, "grad_norm": 0.9473728537559509, "learning_rate": 4.9959990768841115e-05, "loss": 0.1188, "num_input_tokens_seen": 895456, "step": 270 }, { "epoch": 0.0550233849385989, "grad_norm": 0.6116834878921509, "learning_rate": 4.9958495638823676e-05, "loss": 0.0519, "num_input_tokens_seen": 912416, "step": 275 }, { "epoch": 0.0560238101193007, "grad_norm": 1.0323799848556519, "learning_rate": 4.9956973105881947e-05, "loss": 0.0459, "num_input_tokens_seen": 929120, "step": 280 }, { "epoch": 0.0570242353000025, "grad_norm": 0.6817773580551147, "learning_rate": 4.995542317168756e-05, "loss": 0.0558, "num_input_tokens_seen": 945824, "step": 285 }, { "epoch": 0.0580246604807043, "grad_norm": 2.252847909927368, "learning_rate": 4.995384583794227e-05, "loss": 0.0584, "num_input_tokens_seen": 962976, "step": 290 }, { "epoch": 0.059025085661406095, "grad_norm": 1.859374761581421, "learning_rate": 4.995224110637788e-05, "loss": 0.0575, "num_input_tokens_seen": 980864, "step": 295 }, { "epoch": 0.060025510842107895, "grad_norm": 2.2975878715515137, "learning_rate": 4.99506089787563e-05, "loss": 0.0502, "num_input_tokens_seen": 997184, "step": 300 }, { "epoch": 0.061025936022809694, "grad_norm": 1.8312894105911255, "learning_rate": 4.99489494568695e-05, "loss": 0.0619, "num_input_tokens_seen": 1014688, "step": 305 }, { "epoch": 0.062026361203511494, "grad_norm": 1.4239940643310547, "learning_rate": 4.994726254253953e-05, "loss": 0.1254, "num_input_tokens_seen": 1030432, "step": 310 }, { "epoch": 0.06302678638421329, "grad_norm": 0.9822717308998108, "learning_rate": 4.9945548237618535e-05, "loss": 0.0643, "num_input_tokens_seen": 1046048, "step": 315 }, { "epoch": 0.06402721156491509, "grad_norm": 1.1589330434799194, "learning_rate": 4.994380654398871e-05, "loss": 0.0598, "num_input_tokens_seen": 1062496, "step": 320 }, { "epoch": 0.06502763674561689, "grad_norm": 1.6940858364105225, "learning_rate": 4.994203746356233e-05, "loss": 0.0545, "num_input_tokens_seen": 1080000, "step": 325 }, { "epoch": 0.06602806192631869, "grad_norm": 1.0039169788360596, "learning_rate": 4.994024099828174e-05, "loss": 0.0665, "num_input_tokens_seen": 1095712, "step": 330 }, { "epoch": 0.06702848710702049, "grad_norm": 0.5740472674369812, "learning_rate": 4.993841715011934e-05, "loss": 0.0706, "num_input_tokens_seen": 1113088, "step": 335 }, { "epoch": 0.06802891228772229, "grad_norm": 2.5024569034576416, "learning_rate": 4.993656592107763e-05, "loss": 0.0935, "num_input_tokens_seen": 1129952, "step": 340 }, { "epoch": 0.06902933746842407, "grad_norm": 1.3382577896118164, "learning_rate": 4.9934687313189124e-05, "loss": 0.0693, "num_input_tokens_seen": 1147104, "step": 345 }, { "epoch": 0.07002976264912587, "grad_norm": 1.107261300086975, "learning_rate": 4.993278132851643e-05, "loss": 0.0647, "num_input_tokens_seen": 1163232, "step": 350 }, { "epoch": 0.07103018782982767, "grad_norm": 1.4886616468429565, "learning_rate": 4.99308479691522e-05, "loss": 0.0763, "num_input_tokens_seen": 1180160, "step": 355 }, { "epoch": 0.07203061301052947, "grad_norm": 1.7975897789001465, "learning_rate": 4.992888723721916e-05, "loss": 0.0688, "num_input_tokens_seen": 1197632, "step": 360 }, { "epoch": 0.07303103819123127, "grad_norm": 0.9495922327041626, "learning_rate": 4.992689913487006e-05, "loss": 0.0537, "num_input_tokens_seen": 1214016, "step": 365 }, { "epoch": 0.07403146337193307, "grad_norm": 1.2237321138381958, "learning_rate": 4.9924883664287724e-05, "loss": 0.0775, "num_input_tokens_seen": 1230400, "step": 370 }, { "epoch": 0.07503188855263487, "grad_norm": 0.574914813041687, "learning_rate": 4.992284082768501e-05, "loss": 0.0599, "num_input_tokens_seen": 1246720, "step": 375 }, { "epoch": 0.07603231373333667, "grad_norm": 1.1832525730133057, "learning_rate": 4.9920770627304845e-05, "loss": 0.0605, "num_input_tokens_seen": 1265152, "step": 380 }, { "epoch": 0.07703273891403846, "grad_norm": 0.732682466506958, "learning_rate": 4.991867306542016e-05, "loss": 0.0544, "num_input_tokens_seen": 1281952, "step": 385 }, { "epoch": 0.07803316409474026, "grad_norm": 1.6287379264831543, "learning_rate": 4.9916548144333974e-05, "loss": 0.0678, "num_input_tokens_seen": 1299456, "step": 390 }, { "epoch": 0.07903358927544206, "grad_norm": 1.1713122129440308, "learning_rate": 4.991439586637931e-05, "loss": 0.0566, "num_input_tokens_seen": 1315808, "step": 395 }, { "epoch": 0.08003401445614386, "grad_norm": 1.2388888597488403, "learning_rate": 4.991221623391925e-05, "loss": 0.0779, "num_input_tokens_seen": 1332224, "step": 400 }, { "epoch": 0.08103443963684566, "grad_norm": 1.4474679231643677, "learning_rate": 4.991000924934689e-05, "loss": 0.077, "num_input_tokens_seen": 1348928, "step": 405 }, { "epoch": 0.08203486481754746, "grad_norm": 1.1414148807525635, "learning_rate": 4.990777491508536e-05, "loss": 0.0774, "num_input_tokens_seen": 1365600, "step": 410 }, { "epoch": 0.08303528999824926, "grad_norm": 0.7782608270645142, "learning_rate": 4.990551323358784e-05, "loss": 0.061, "num_input_tokens_seen": 1381696, "step": 415 }, { "epoch": 0.08403571517895106, "grad_norm": 0.6450750827789307, "learning_rate": 4.990322420733751e-05, "loss": 0.0484, "num_input_tokens_seen": 1398496, "step": 420 }, { "epoch": 0.08503614035965286, "grad_norm": 0.9892857670783997, "learning_rate": 4.990090783884759e-05, "loss": 0.0422, "num_input_tokens_seen": 1414368, "step": 425 }, { "epoch": 0.08603656554035465, "grad_norm": 1.7115873098373413, "learning_rate": 4.989856413066131e-05, "loss": 0.0594, "num_input_tokens_seen": 1431168, "step": 430 }, { "epoch": 0.08703699072105645, "grad_norm": 0.8290610313415527, "learning_rate": 4.9896193085351904e-05, "loss": 0.0448, "num_input_tokens_seen": 1447520, "step": 435 }, { "epoch": 0.08803741590175825, "grad_norm": 1.5415170192718506, "learning_rate": 4.9893794705522666e-05, "loss": 0.0707, "num_input_tokens_seen": 1464448, "step": 440 }, { "epoch": 0.08903784108246005, "grad_norm": 1.3662270307540894, "learning_rate": 4.989136899380686e-05, "loss": 0.0374, "num_input_tokens_seen": 1481280, "step": 445 }, { "epoch": 0.09003826626316185, "grad_norm": 0.9870457649230957, "learning_rate": 4.9888915952867766e-05, "loss": 0.0422, "num_input_tokens_seen": 1497792, "step": 450 }, { "epoch": 0.09103869144386365, "grad_norm": 2.193429470062256, "learning_rate": 4.988643558539867e-05, "loss": 0.0861, "num_input_tokens_seen": 1513920, "step": 455 }, { "epoch": 0.09203911662456545, "grad_norm": 0.9253290891647339, "learning_rate": 4.988392789412288e-05, "loss": 0.0626, "num_input_tokens_seen": 1530240, "step": 460 }, { "epoch": 0.09303954180526725, "grad_norm": 0.678046464920044, "learning_rate": 4.98813928817937e-05, "loss": 0.0354, "num_input_tokens_seen": 1547264, "step": 465 }, { "epoch": 0.09403996698596903, "grad_norm": 0.8769763708114624, "learning_rate": 4.987883055119439e-05, "loss": 0.0509, "num_input_tokens_seen": 1564288, "step": 470 }, { "epoch": 0.09504039216667083, "grad_norm": 0.7749297022819519, "learning_rate": 4.987624090513825e-05, "loss": 0.0408, "num_input_tokens_seen": 1581312, "step": 475 }, { "epoch": 0.09604081734737263, "grad_norm": 1.0127102136611938, "learning_rate": 4.987362394646856e-05, "loss": 0.0818, "num_input_tokens_seen": 1598432, "step": 480 }, { "epoch": 0.09704124252807443, "grad_norm": 1.330907940864563, "learning_rate": 4.987097967805858e-05, "loss": 0.0999, "num_input_tokens_seen": 1614560, "step": 485 }, { "epoch": 0.09804166770877623, "grad_norm": 1.55445396900177, "learning_rate": 4.986830810281156e-05, "loss": 0.0471, "num_input_tokens_seen": 1631136, "step": 490 }, { "epoch": 0.09904209288947803, "grad_norm": 0.9867338538169861, "learning_rate": 4.9865609223660714e-05, "loss": 0.0904, "num_input_tokens_seen": 1647872, "step": 495 }, { "epoch": 0.10004251807017983, "grad_norm": 1.0111794471740723, "learning_rate": 4.986288304356926e-05, "loss": 0.061, "num_input_tokens_seen": 1664800, "step": 500 }, { "epoch": 0.10104294325088163, "grad_norm": 1.2285752296447754, "learning_rate": 4.986012956553038e-05, "loss": 0.0839, "num_input_tokens_seen": 1681056, "step": 505 }, { "epoch": 0.10204336843158342, "grad_norm": 0.5017119646072388, "learning_rate": 4.985734879256723e-05, "loss": 0.0393, "num_input_tokens_seen": 1696960, "step": 510 }, { "epoch": 0.10304379361228522, "grad_norm": 0.6441024541854858, "learning_rate": 4.985454072773292e-05, "loss": 0.0452, "num_input_tokens_seen": 1714432, "step": 515 }, { "epoch": 0.10404421879298702, "grad_norm": 0.23873034119606018, "learning_rate": 4.9851705374110556e-05, "loss": 0.0572, "num_input_tokens_seen": 1731104, "step": 520 }, { "epoch": 0.10504464397368882, "grad_norm": 0.7182571291923523, "learning_rate": 4.984884273481316e-05, "loss": 0.0638, "num_input_tokens_seen": 1747168, "step": 525 }, { "epoch": 0.10604506915439062, "grad_norm": 1.063759207725525, "learning_rate": 4.984595281298375e-05, "loss": 0.0577, "num_input_tokens_seen": 1762912, "step": 530 }, { "epoch": 0.10704549433509242, "grad_norm": 0.7074857354164124, "learning_rate": 4.9843035611795295e-05, "loss": 0.0283, "num_input_tokens_seen": 1779904, "step": 535 }, { "epoch": 0.10804591951579422, "grad_norm": 1.392200231552124, "learning_rate": 4.98400911344507e-05, "loss": 0.0688, "num_input_tokens_seen": 1795360, "step": 540 }, { "epoch": 0.10904634469649602, "grad_norm": 0.5381610989570618, "learning_rate": 4.983711938418281e-05, "loss": 0.0209, "num_input_tokens_seen": 1812128, "step": 545 }, { "epoch": 0.1100467698771978, "grad_norm": 2.0378992557525635, "learning_rate": 4.9834120364254436e-05, "loss": 0.0653, "num_input_tokens_seen": 1828960, "step": 550 }, { "epoch": 0.1110471950578996, "grad_norm": 1.9617708921432495, "learning_rate": 4.9831094077958326e-05, "loss": 0.0744, "num_input_tokens_seen": 1845344, "step": 555 }, { "epoch": 0.1120476202386014, "grad_norm": 0.4756849706172943, "learning_rate": 4.9828040528617156e-05, "loss": 0.0814, "num_input_tokens_seen": 1861344, "step": 560 }, { "epoch": 0.1130480454193032, "grad_norm": 0.37431013584136963, "learning_rate": 4.9824959719583545e-05, "loss": 0.045, "num_input_tokens_seen": 1877696, "step": 565 }, { "epoch": 0.114048470600005, "grad_norm": 0.8085149526596069, "learning_rate": 4.9821851654240026e-05, "loss": 0.0442, "num_input_tokens_seen": 1894912, "step": 570 }, { "epoch": 0.1150488957807068, "grad_norm": 0.6764988303184509, "learning_rate": 4.981871633599908e-05, "loss": 0.0417, "num_input_tokens_seen": 1911520, "step": 575 }, { "epoch": 0.1160493209614086, "grad_norm": 0.9866617918014526, "learning_rate": 4.981555376830309e-05, "loss": 0.0564, "num_input_tokens_seen": 1928160, "step": 580 }, { "epoch": 0.1170497461421104, "grad_norm": 0.6540876626968384, "learning_rate": 4.9812363954624356e-05, "loss": 0.0401, "num_input_tokens_seen": 1944096, "step": 585 }, { "epoch": 0.11805017132281219, "grad_norm": 0.5941640138626099, "learning_rate": 4.980914689846512e-05, "loss": 0.036, "num_input_tokens_seen": 1959904, "step": 590 }, { "epoch": 0.11905059650351399, "grad_norm": 1.2908644676208496, "learning_rate": 4.98059026033575e-05, "loss": 0.0304, "num_input_tokens_seen": 1976160, "step": 595 }, { "epoch": 0.12005102168421579, "grad_norm": 1.6685086488723755, "learning_rate": 4.980263107286355e-05, "loss": 0.104, "num_input_tokens_seen": 1991648, "step": 600 }, { "epoch": 0.12105144686491759, "grad_norm": 1.4214237928390503, "learning_rate": 4.97993323105752e-05, "loss": 0.0701, "num_input_tokens_seen": 2006848, "step": 605 }, { "epoch": 0.12205187204561939, "grad_norm": 1.3490134477615356, "learning_rate": 4.9796006320114295e-05, "loss": 0.0475, "num_input_tokens_seen": 2022304, "step": 610 }, { "epoch": 0.12305229722632119, "grad_norm": 0.7132759094238281, "learning_rate": 4.979265310513258e-05, "loss": 0.0437, "num_input_tokens_seen": 2038624, "step": 615 }, { "epoch": 0.12405272240702299, "grad_norm": 1.148813009262085, "learning_rate": 4.978927266931167e-05, "loss": 0.0514, "num_input_tokens_seen": 2054656, "step": 620 }, { "epoch": 0.1250531475877248, "grad_norm": 1.119343876838684, "learning_rate": 4.9785865016363096e-05, "loss": 0.0478, "num_input_tokens_seen": 2070912, "step": 625 }, { "epoch": 0.12605357276842657, "grad_norm": 1.2424360513687134, "learning_rate": 4.9782430150028246e-05, "loss": 0.0646, "num_input_tokens_seen": 2086272, "step": 630 }, { "epoch": 0.1270539979491284, "grad_norm": 0.9024567008018494, "learning_rate": 4.97789680740784e-05, "loss": 0.0561, "num_input_tokens_seen": 2102464, "step": 635 }, { "epoch": 0.12805442312983017, "grad_norm": 0.31885960698127747, "learning_rate": 4.977547879231471e-05, "loss": 0.0416, "num_input_tokens_seen": 2119200, "step": 640 }, { "epoch": 0.129054848310532, "grad_norm": 0.7536543011665344, "learning_rate": 4.977196230856819e-05, "loss": 0.09, "num_input_tokens_seen": 2134560, "step": 645 }, { "epoch": 0.13005527349123377, "grad_norm": 0.8086920976638794, "learning_rate": 4.9768418626699734e-05, "loss": 0.0586, "num_input_tokens_seen": 2150464, "step": 650 }, { "epoch": 0.13105569867193556, "grad_norm": 0.47185018658638, "learning_rate": 4.9764847750600086e-05, "loss": 0.0331, "num_input_tokens_seen": 2167776, "step": 655 }, { "epoch": 0.13205612385263737, "grad_norm": 0.4128211438655853, "learning_rate": 4.976124968418985e-05, "loss": 0.0486, "num_input_tokens_seen": 2185792, "step": 660 }, { "epoch": 0.13305654903333916, "grad_norm": 1.0512120723724365, "learning_rate": 4.975762443141949e-05, "loss": 0.0673, "num_input_tokens_seen": 2202848, "step": 665 }, { "epoch": 0.13405697421404097, "grad_norm": 1.3533042669296265, "learning_rate": 4.975397199626933e-05, "loss": 0.0771, "num_input_tokens_seen": 2219904, "step": 670 }, { "epoch": 0.13505739939474276, "grad_norm": 0.9410749673843384, "learning_rate": 4.97502923827495e-05, "loss": 0.0692, "num_input_tokens_seen": 2236128, "step": 675 }, { "epoch": 0.13605782457544457, "grad_norm": 0.7121180891990662, "learning_rate": 4.9746585594900005e-05, "loss": 0.0424, "num_input_tokens_seen": 2253216, "step": 680 }, { "epoch": 0.13705824975614636, "grad_norm": 0.4436166286468506, "learning_rate": 4.9742851636790676e-05, "loss": 0.0518, "num_input_tokens_seen": 2269120, "step": 685 }, { "epoch": 0.13805867493684815, "grad_norm": 0.9069826006889343, "learning_rate": 4.973909051252117e-05, "loss": 0.0378, "num_input_tokens_seen": 2286560, "step": 690 }, { "epoch": 0.13905910011754996, "grad_norm": 0.47360703349113464, "learning_rate": 4.973530222622097e-05, "loss": 0.057, "num_input_tokens_seen": 2302880, "step": 695 }, { "epoch": 0.14005952529825175, "grad_norm": 0.3753754794597626, "learning_rate": 4.9731486782049404e-05, "loss": 0.0747, "num_input_tokens_seen": 2320384, "step": 700 }, { "epoch": 0.14105995047895356, "grad_norm": 0.3111303150653839, "learning_rate": 4.972764418419558e-05, "loss": 0.0506, "num_input_tokens_seen": 2336576, "step": 705 }, { "epoch": 0.14206037565965535, "grad_norm": 0.8782441020011902, "learning_rate": 4.972377443687845e-05, "loss": 0.0489, "num_input_tokens_seen": 2353152, "step": 710 }, { "epoch": 0.14306080084035716, "grad_norm": 0.26745501160621643, "learning_rate": 4.971987754434676e-05, "loss": 0.0236, "num_input_tokens_seen": 2370336, "step": 715 }, { "epoch": 0.14406122602105895, "grad_norm": 0.4768848121166229, "learning_rate": 4.971595351087907e-05, "loss": 0.0382, "num_input_tokens_seen": 2386496, "step": 720 }, { "epoch": 0.14506165120176076, "grad_norm": 1.0148855447769165, "learning_rate": 4.971200234078372e-05, "loss": 0.0584, "num_input_tokens_seen": 2402496, "step": 725 }, { "epoch": 0.14606207638246255, "grad_norm": 0.7883719801902771, "learning_rate": 4.970802403839886e-05, "loss": 0.032, "num_input_tokens_seen": 2419008, "step": 730 }, { "epoch": 0.14706250156316433, "grad_norm": 1.0178841352462769, "learning_rate": 4.970401860809244e-05, "loss": 0.0398, "num_input_tokens_seen": 2435360, "step": 735 }, { "epoch": 0.14806292674386615, "grad_norm": 1.6463992595672607, "learning_rate": 4.969998605426216e-05, "loss": 0.0649, "num_input_tokens_seen": 2451360, "step": 740 }, { "epoch": 0.14906335192456793, "grad_norm": 1.079568862915039, "learning_rate": 4.969592638133553e-05, "loss": 0.0326, "num_input_tokens_seen": 2467360, "step": 745 }, { "epoch": 0.15006377710526975, "grad_norm": 0.28534814715385437, "learning_rate": 4.9691839593769836e-05, "loss": 0.0699, "num_input_tokens_seen": 2483840, "step": 750 }, { "epoch": 0.15106420228597153, "grad_norm": 0.7496660351753235, "learning_rate": 4.968772569605211e-05, "loss": 0.0264, "num_input_tokens_seen": 2500640, "step": 755 }, { "epoch": 0.15206462746667335, "grad_norm": 1.5654454231262207, "learning_rate": 4.968358469269917e-05, "loss": 0.0303, "num_input_tokens_seen": 2516960, "step": 760 }, { "epoch": 0.15306505264737513, "grad_norm": 0.4835239350795746, "learning_rate": 4.967941658825759e-05, "loss": 0.0546, "num_input_tokens_seen": 2533152, "step": 765 }, { "epoch": 0.15406547782807692, "grad_norm": 0.7468734383583069, "learning_rate": 4.96752213873037e-05, "loss": 0.0277, "num_input_tokens_seen": 2549792, "step": 770 }, { "epoch": 0.15506590300877873, "grad_norm": 1.111474871635437, "learning_rate": 4.9670999094443574e-05, "loss": 0.0362, "num_input_tokens_seen": 2566272, "step": 775 }, { "epoch": 0.15606632818948052, "grad_norm": 0.8435876369476318, "learning_rate": 4.966674971431302e-05, "loss": 0.0585, "num_input_tokens_seen": 2583552, "step": 780 }, { "epoch": 0.15706675337018233, "grad_norm": 0.9625574350357056, "learning_rate": 4.966247325157763e-05, "loss": 0.0463, "num_input_tokens_seen": 2599648, "step": 785 }, { "epoch": 0.15806717855088412, "grad_norm": 0.7700160145759583, "learning_rate": 4.9658169710932676e-05, "loss": 0.0816, "num_input_tokens_seen": 2615712, "step": 790 }, { "epoch": 0.15906760373158593, "grad_norm": 0.42034628987312317, "learning_rate": 4.965383909710321e-05, "loss": 0.0543, "num_input_tokens_seen": 2633312, "step": 795 }, { "epoch": 0.16006802891228772, "grad_norm": 0.4920070469379425, "learning_rate": 4.964948141484397e-05, "loss": 0.0604, "num_input_tokens_seen": 2650496, "step": 800 }, { "epoch": 0.16106845409298953, "grad_norm": 0.43571746349334717, "learning_rate": 4.964509666893943e-05, "loss": 0.0334, "num_input_tokens_seen": 2666432, "step": 805 }, { "epoch": 0.16206887927369132, "grad_norm": 1.0017355680465698, "learning_rate": 4.9640684864203786e-05, "loss": 0.0324, "num_input_tokens_seen": 2683168, "step": 810 }, { "epoch": 0.1630693044543931, "grad_norm": 0.2180410772562027, "learning_rate": 4.9636246005480916e-05, "loss": 0.0711, "num_input_tokens_seen": 2698784, "step": 815 }, { "epoch": 0.16406972963509492, "grad_norm": 0.5853586792945862, "learning_rate": 4.963178009764444e-05, "loss": 0.0676, "num_input_tokens_seen": 2715104, "step": 820 }, { "epoch": 0.1650701548157967, "grad_norm": 0.7395941019058228, "learning_rate": 4.962728714559765e-05, "loss": 0.0539, "num_input_tokens_seen": 2731616, "step": 825 }, { "epoch": 0.16607057999649852, "grad_norm": 0.8851535320281982, "learning_rate": 4.962276715427351e-05, "loss": 0.0689, "num_input_tokens_seen": 2748448, "step": 830 }, { "epoch": 0.1670710051772003, "grad_norm": 0.2044210433959961, "learning_rate": 4.961822012863474e-05, "loss": 0.0397, "num_input_tokens_seen": 2766528, "step": 835 }, { "epoch": 0.16807143035790212, "grad_norm": 0.7691633105278015, "learning_rate": 4.9613646073673673e-05, "loss": 0.0404, "num_input_tokens_seen": 2783488, "step": 840 }, { "epoch": 0.1690718555386039, "grad_norm": 0.8963852524757385, "learning_rate": 4.9609044994412356e-05, "loss": 0.0359, "num_input_tokens_seen": 2799904, "step": 845 }, { "epoch": 0.17007228071930572, "grad_norm": 1.6576282978057861, "learning_rate": 4.960441689590249e-05, "loss": 0.0608, "num_input_tokens_seen": 2817376, "step": 850 }, { "epoch": 0.1710727059000075, "grad_norm": 0.40670549869537354, "learning_rate": 4.959976178322546e-05, "loss": 0.0646, "num_input_tokens_seen": 2834432, "step": 855 }, { "epoch": 0.1720731310807093, "grad_norm": 0.5909866690635681, "learning_rate": 4.959507966149229e-05, "loss": 0.047, "num_input_tokens_seen": 2851040, "step": 860 }, { "epoch": 0.1730735562614111, "grad_norm": 1.1720274686813354, "learning_rate": 4.9590370535843664e-05, "loss": 0.0504, "num_input_tokens_seen": 2867072, "step": 865 }, { "epoch": 0.1740739814421129, "grad_norm": 0.6624909043312073, "learning_rate": 4.9585634411449935e-05, "loss": 0.0558, "num_input_tokens_seen": 2882944, "step": 870 }, { "epoch": 0.1750744066228147, "grad_norm": 0.8819348216056824, "learning_rate": 4.958087129351106e-05, "loss": 0.0659, "num_input_tokens_seen": 2900832, "step": 875 }, { "epoch": 0.1760748318035165, "grad_norm": 0.5899621844291687, "learning_rate": 4.9576081187256666e-05, "loss": 0.0465, "num_input_tokens_seen": 2917056, "step": 880 }, { "epoch": 0.1770752569842183, "grad_norm": 0.7371861934661865, "learning_rate": 4.9571264097946e-05, "loss": 0.0401, "num_input_tokens_seen": 2934144, "step": 885 }, { "epoch": 0.1780756821649201, "grad_norm": 0.2233559936285019, "learning_rate": 4.956642003086794e-05, "loss": 0.0296, "num_input_tokens_seen": 2949984, "step": 890 }, { "epoch": 0.17907610734562188, "grad_norm": 1.363047480583191, "learning_rate": 4.956154899134098e-05, "loss": 0.0571, "num_input_tokens_seen": 2966176, "step": 895 }, { "epoch": 0.1800765325263237, "grad_norm": 0.9176114201545715, "learning_rate": 4.955665098471323e-05, "loss": 0.0433, "num_input_tokens_seen": 2982144, "step": 900 }, { "epoch": 0.18107695770702548, "grad_norm": 0.9925788640975952, "learning_rate": 4.9551726016362386e-05, "loss": 0.0587, "num_input_tokens_seen": 2998688, "step": 905 }, { "epoch": 0.1820773828877273, "grad_norm": 0.8520196676254272, "learning_rate": 4.954677409169579e-05, "loss": 0.044, "num_input_tokens_seen": 3015360, "step": 910 }, { "epoch": 0.18307780806842908, "grad_norm": 0.9778660535812378, "learning_rate": 4.954179521615035e-05, "loss": 0.0546, "num_input_tokens_seen": 3032416, "step": 915 }, { "epoch": 0.1840782332491309, "grad_norm": 0.682713508605957, "learning_rate": 4.953678939519256e-05, "loss": 0.0689, "num_input_tokens_seen": 3048608, "step": 920 }, { "epoch": 0.18507865842983268, "grad_norm": 0.3935362994670868, "learning_rate": 4.9531756634318524e-05, "loss": 0.047, "num_input_tokens_seen": 3065568, "step": 925 }, { "epoch": 0.1860790836105345, "grad_norm": 0.8297384977340698, "learning_rate": 4.952669693905391e-05, "loss": 0.07, "num_input_tokens_seen": 3083264, "step": 930 }, { "epoch": 0.18707950879123628, "grad_norm": 0.9590235948562622, "learning_rate": 4.952161031495395e-05, "loss": 0.0725, "num_input_tokens_seen": 3099296, "step": 935 }, { "epoch": 0.18807993397193806, "grad_norm": 0.3106594383716583, "learning_rate": 4.951649676760346e-05, "loss": 0.032, "num_input_tokens_seen": 3116000, "step": 940 }, { "epoch": 0.18908035915263988, "grad_norm": 0.5347669124603271, "learning_rate": 4.951135630261679e-05, "loss": 0.0584, "num_input_tokens_seen": 3132544, "step": 945 }, { "epoch": 0.19008078433334166, "grad_norm": 0.9589521288871765, "learning_rate": 4.950618892563789e-05, "loss": 0.0426, "num_input_tokens_seen": 3149472, "step": 950 }, { "epoch": 0.19108120951404348, "grad_norm": 1.2461116313934326, "learning_rate": 4.9500994642340195e-05, "loss": 0.0605, "num_input_tokens_seen": 3165152, "step": 955 }, { "epoch": 0.19208163469474526, "grad_norm": 0.48353829979896545, "learning_rate": 4.949577345842674e-05, "loss": 0.053, "num_input_tokens_seen": 3181568, "step": 960 }, { "epoch": 0.19308205987544708, "grad_norm": 0.8187334537506104, "learning_rate": 4.949052537963006e-05, "loss": 0.0426, "num_input_tokens_seen": 3198336, "step": 965 }, { "epoch": 0.19408248505614886, "grad_norm": 0.49673548340797424, "learning_rate": 4.9485250411712224e-05, "loss": 0.047, "num_input_tokens_seen": 3215392, "step": 970 }, { "epoch": 0.19508291023685065, "grad_norm": 0.5972616076469421, "learning_rate": 4.947994856046484e-05, "loss": 0.07, "num_input_tokens_seen": 3232576, "step": 975 }, { "epoch": 0.19608333541755246, "grad_norm": 0.7993170619010925, "learning_rate": 4.947461983170902e-05, "loss": 0.0381, "num_input_tokens_seen": 3248320, "step": 980 }, { "epoch": 0.19708376059825425, "grad_norm": 0.7358092665672302, "learning_rate": 4.9469264231295376e-05, "loss": 0.0422, "num_input_tokens_seen": 3266432, "step": 985 }, { "epoch": 0.19808418577895606, "grad_norm": 0.4396766722202301, "learning_rate": 4.9463881765104045e-05, "loss": 0.0448, "num_input_tokens_seen": 3283168, "step": 990 }, { "epoch": 0.19908461095965785, "grad_norm": 0.2774893343448639, "learning_rate": 4.945847243904466e-05, "loss": 0.0579, "num_input_tokens_seen": 3301280, "step": 995 }, { "epoch": 0.20008503614035966, "grad_norm": 0.6029003858566284, "learning_rate": 4.945303625905631e-05, "loss": 0.0329, "num_input_tokens_seen": 3317536, "step": 1000 }, { "epoch": 0.20108546132106145, "grad_norm": 1.284956932067871, "learning_rate": 4.944757323110761e-05, "loss": 0.06, "num_input_tokens_seen": 3334336, "step": 1005 }, { "epoch": 0.20208588650176326, "grad_norm": 0.3467988073825836, "learning_rate": 4.9442083361196627e-05, "loss": 0.0432, "num_input_tokens_seen": 3351616, "step": 1010 }, { "epoch": 0.20308631168246505, "grad_norm": 0.5883722901344299, "learning_rate": 4.9436566655350917e-05, "loss": 0.036, "num_input_tokens_seen": 3367840, "step": 1015 }, { "epoch": 0.20408673686316683, "grad_norm": 0.6289622783660889, "learning_rate": 4.9431023119627484e-05, "loss": 0.0511, "num_input_tokens_seen": 3384128, "step": 1020 }, { "epoch": 0.20508716204386865, "grad_norm": 0.5500460267066956, "learning_rate": 4.94254527601128e-05, "loss": 0.0319, "num_input_tokens_seen": 3400000, "step": 1025 }, { "epoch": 0.20608758722457043, "grad_norm": 0.6038098335266113, "learning_rate": 4.941985558292278e-05, "loss": 0.0427, "num_input_tokens_seen": 3417504, "step": 1030 }, { "epoch": 0.20708801240527225, "grad_norm": 0.3944091498851776, "learning_rate": 4.9414231594202784e-05, "loss": 0.0407, "num_input_tokens_seen": 3434080, "step": 1035 }, { "epoch": 0.20808843758597403, "grad_norm": 0.7011579275131226, "learning_rate": 4.940858080012763e-05, "loss": 0.0479, "num_input_tokens_seen": 3450336, "step": 1040 }, { "epoch": 0.20908886276667585, "grad_norm": 0.8541505336761475, "learning_rate": 4.940290320690153e-05, "loss": 0.0663, "num_input_tokens_seen": 3468064, "step": 1045 }, { "epoch": 0.21008928794737763, "grad_norm": 0.6061210036277771, "learning_rate": 4.9397198820758144e-05, "loss": 0.0371, "num_input_tokens_seen": 3484384, "step": 1050 }, { "epoch": 0.21108971312807942, "grad_norm": 0.5961213707923889, "learning_rate": 4.939146764796055e-05, "loss": 0.022, "num_input_tokens_seen": 3502240, "step": 1055 }, { "epoch": 0.21209013830878123, "grad_norm": 0.48619797825813293, "learning_rate": 4.9385709694801227e-05, "loss": 0.0513, "num_input_tokens_seen": 3518784, "step": 1060 }, { "epoch": 0.21309056348948302, "grad_norm": 0.4226716160774231, "learning_rate": 4.937992496760207e-05, "loss": 0.041, "num_input_tokens_seen": 3535360, "step": 1065 }, { "epoch": 0.21409098867018483, "grad_norm": 0.11164263635873795, "learning_rate": 4.937411347271434e-05, "loss": 0.0479, "num_input_tokens_seen": 3550816, "step": 1070 }, { "epoch": 0.21509141385088662, "grad_norm": 0.3861413300037384, "learning_rate": 4.9368275216518716e-05, "loss": 0.047, "num_input_tokens_seen": 3568128, "step": 1075 }, { "epoch": 0.21609183903158843, "grad_norm": 0.9424998760223389, "learning_rate": 4.936241020542526e-05, "loss": 0.0394, "num_input_tokens_seen": 3585344, "step": 1080 }, { "epoch": 0.21709226421229022, "grad_norm": 0.4098639488220215, "learning_rate": 4.9356518445873404e-05, "loss": 0.0363, "num_input_tokens_seen": 3601472, "step": 1085 }, { "epoch": 0.21809268939299203, "grad_norm": 0.5707319378852844, "learning_rate": 4.935059994433192e-05, "loss": 0.0547, "num_input_tokens_seen": 3619584, "step": 1090 }, { "epoch": 0.21909311457369382, "grad_norm": 0.36102646589279175, "learning_rate": 4.9344654707298996e-05, "loss": 0.0299, "num_input_tokens_seen": 3636576, "step": 1095 }, { "epoch": 0.2200935397543956, "grad_norm": 0.8916822075843811, "learning_rate": 4.9338682741302134e-05, "loss": 0.0506, "num_input_tokens_seen": 3653440, "step": 1100 }, { "epoch": 0.22109396493509742, "grad_norm": 0.6885157823562622, "learning_rate": 4.933268405289818e-05, "loss": 0.0613, "num_input_tokens_seen": 3671072, "step": 1105 }, { "epoch": 0.2220943901157992, "grad_norm": 0.5270304083824158, "learning_rate": 4.932665864867335e-05, "loss": 0.0615, "num_input_tokens_seen": 3687968, "step": 1110 }, { "epoch": 0.22309481529650102, "grad_norm": 0.16785143315792084, "learning_rate": 4.932060653524316e-05, "loss": 0.0527, "num_input_tokens_seen": 3704768, "step": 1115 }, { "epoch": 0.2240952404772028, "grad_norm": 0.6057844758033752, "learning_rate": 4.931452771925248e-05, "loss": 0.0554, "num_input_tokens_seen": 3720512, "step": 1120 }, { "epoch": 0.22509566565790462, "grad_norm": 0.23519811034202576, "learning_rate": 4.9308422207375475e-05, "loss": 0.0406, "num_input_tokens_seen": 3738176, "step": 1125 }, { "epoch": 0.2260960908386064, "grad_norm": 1.2040951251983643, "learning_rate": 4.930229000631563e-05, "loss": 0.0585, "num_input_tokens_seen": 3754592, "step": 1130 }, { "epoch": 0.2270965160193082, "grad_norm": 0.15998463332653046, "learning_rate": 4.929613112280573e-05, "loss": 0.0479, "num_input_tokens_seen": 3770976, "step": 1135 }, { "epoch": 0.22809694120001, "grad_norm": 0.4066956341266632, "learning_rate": 4.928994556360787e-05, "loss": 0.0552, "num_input_tokens_seen": 3787872, "step": 1140 }, { "epoch": 0.2290973663807118, "grad_norm": 0.967108964920044, "learning_rate": 4.9283733335513404e-05, "loss": 0.0601, "num_input_tokens_seen": 3804416, "step": 1145 }, { "epoch": 0.2300977915614136, "grad_norm": 0.3622339963912964, "learning_rate": 4.9277494445342995e-05, "loss": 0.0451, "num_input_tokens_seen": 3820672, "step": 1150 }, { "epoch": 0.2310982167421154, "grad_norm": 0.39043447375297546, "learning_rate": 4.927122889994657e-05, "loss": 0.0348, "num_input_tokens_seen": 3836960, "step": 1155 }, { "epoch": 0.2320986419228172, "grad_norm": 0.3957688510417938, "learning_rate": 4.926493670620331e-05, "loss": 0.0369, "num_input_tokens_seen": 3854112, "step": 1160 }, { "epoch": 0.233099067103519, "grad_norm": 0.8970000147819519, "learning_rate": 4.9258617871021675e-05, "loss": 0.0324, "num_input_tokens_seen": 3870752, "step": 1165 }, { "epoch": 0.2340994922842208, "grad_norm": 0.2305895835161209, "learning_rate": 4.925227240133936e-05, "loss": 0.0372, "num_input_tokens_seen": 3887168, "step": 1170 }, { "epoch": 0.2350999174649226, "grad_norm": 0.4444639980792999, "learning_rate": 4.924590030412331e-05, "loss": 0.034, "num_input_tokens_seen": 3903104, "step": 1175 }, { "epoch": 0.23610034264562438, "grad_norm": 0.38624247908592224, "learning_rate": 4.92395015863697e-05, "loss": 0.0674, "num_input_tokens_seen": 3919232, "step": 1180 }, { "epoch": 0.2371007678263262, "grad_norm": 0.9376976490020752, "learning_rate": 4.923307625510395e-05, "loss": 0.069, "num_input_tokens_seen": 3935872, "step": 1185 }, { "epoch": 0.23810119300702798, "grad_norm": 0.168256938457489, "learning_rate": 4.922662431738066e-05, "loss": 0.0449, "num_input_tokens_seen": 3952896, "step": 1190 }, { "epoch": 0.2391016181877298, "grad_norm": 0.3504430651664734, "learning_rate": 4.92201457802837e-05, "loss": 0.0462, "num_input_tokens_seen": 3970400, "step": 1195 }, { "epoch": 0.24010204336843158, "grad_norm": 0.051882412284612656, "learning_rate": 4.921364065092609e-05, "loss": 0.0328, "num_input_tokens_seen": 3986880, "step": 1200 }, { "epoch": 0.2411024685491334, "grad_norm": 0.6538947820663452, "learning_rate": 4.920710893645008e-05, "loss": 0.032, "num_input_tokens_seen": 4002752, "step": 1205 }, { "epoch": 0.24210289372983518, "grad_norm": 0.5337681174278259, "learning_rate": 4.9200550644027096e-05, "loss": 0.048, "num_input_tokens_seen": 4019744, "step": 1210 }, { "epoch": 0.243103318910537, "grad_norm": 0.4791374206542969, "learning_rate": 4.9193965780857754e-05, "loss": 0.0306, "num_input_tokens_seen": 4035968, "step": 1215 }, { "epoch": 0.24410374409123878, "grad_norm": 1.150654911994934, "learning_rate": 4.9187354354171835e-05, "loss": 0.0907, "num_input_tokens_seen": 4053472, "step": 1220 }, { "epoch": 0.24510416927194056, "grad_norm": 0.6459364891052246, "learning_rate": 4.9180716371228275e-05, "loss": 0.0526, "num_input_tokens_seen": 4070208, "step": 1225 }, { "epoch": 0.24610459445264238, "grad_norm": 0.7645431160926819, "learning_rate": 4.9174051839315196e-05, "loss": 0.047, "num_input_tokens_seen": 4085728, "step": 1230 }, { "epoch": 0.24710501963334416, "grad_norm": 0.3979432284832001, "learning_rate": 4.916736076574984e-05, "loss": 0.0278, "num_input_tokens_seen": 4102368, "step": 1235 }, { "epoch": 0.24810544481404598, "grad_norm": 0.6675825715065002, "learning_rate": 4.9160643157878616e-05, "loss": 0.0395, "num_input_tokens_seen": 4118912, "step": 1240 }, { "epoch": 0.24910586999474776, "grad_norm": 0.6228586435317993, "learning_rate": 4.915389902307704e-05, "loss": 0.0384, "num_input_tokens_seen": 4136192, "step": 1245 }, { "epoch": 0.2501062951754496, "grad_norm": 0.48848792910575867, "learning_rate": 4.914712836874977e-05, "loss": 0.0375, "num_input_tokens_seen": 4153024, "step": 1250 }, { "epoch": 0.2511067203561514, "grad_norm": 0.7998842597007751, "learning_rate": 4.914033120233057e-05, "loss": 0.0433, "num_input_tokens_seen": 4169984, "step": 1255 }, { "epoch": 0.25210714553685315, "grad_norm": 0.8730126023292542, "learning_rate": 4.913350753128233e-05, "loss": 0.0702, "num_input_tokens_seen": 4187168, "step": 1260 }, { "epoch": 0.25310757071755496, "grad_norm": 0.9048335552215576, "learning_rate": 4.912665736309703e-05, "loss": 0.0475, "num_input_tokens_seen": 4205120, "step": 1265 }, { "epoch": 0.2541079958982568, "grad_norm": 0.4183575212955475, "learning_rate": 4.9119780705295735e-05, "loss": 0.0418, "num_input_tokens_seen": 4221184, "step": 1270 }, { "epoch": 0.25510842107895854, "grad_norm": 0.2847769260406494, "learning_rate": 4.91128775654286e-05, "loss": 0.0421, "num_input_tokens_seen": 4236928, "step": 1275 }, { "epoch": 0.25610884625966035, "grad_norm": 0.45041540265083313, "learning_rate": 4.9105947951074874e-05, "loss": 0.0422, "num_input_tokens_seen": 4254752, "step": 1280 }, { "epoch": 0.25710927144036216, "grad_norm": 0.6893653273582458, "learning_rate": 4.909899186984283e-05, "loss": 0.0594, "num_input_tokens_seen": 4271648, "step": 1285 }, { "epoch": 0.258109696621064, "grad_norm": 0.876491904258728, "learning_rate": 4.9092009329369834e-05, "loss": 0.0618, "num_input_tokens_seen": 4288224, "step": 1290 }, { "epoch": 0.25911012180176574, "grad_norm": 0.7342787981033325, "learning_rate": 4.908500033732231e-05, "loss": 0.0511, "num_input_tokens_seen": 4304512, "step": 1295 }, { "epoch": 0.26011054698246755, "grad_norm": 0.6910397410392761, "learning_rate": 4.9077964901395686e-05, "loss": 0.0735, "num_input_tokens_seen": 4320992, "step": 1300 }, { "epoch": 0.26111097216316936, "grad_norm": 0.5218247771263123, "learning_rate": 4.907090302931446e-05, "loss": 0.0532, "num_input_tokens_seen": 4336160, "step": 1305 }, { "epoch": 0.2621113973438711, "grad_norm": 0.5280895233154297, "learning_rate": 4.906381472883215e-05, "loss": 0.0516, "num_input_tokens_seen": 4352640, "step": 1310 }, { "epoch": 0.26311182252457294, "grad_norm": 0.47305095195770264, "learning_rate": 4.905670000773126e-05, "loss": 0.0354, "num_input_tokens_seen": 4369312, "step": 1315 }, { "epoch": 0.26411224770527475, "grad_norm": 0.8043111562728882, "learning_rate": 4.904955887382335e-05, "loss": 0.0365, "num_input_tokens_seen": 4384512, "step": 1320 }, { "epoch": 0.26511267288597656, "grad_norm": 0.705629289150238, "learning_rate": 4.904239133494893e-05, "loss": 0.0306, "num_input_tokens_seen": 4401056, "step": 1325 }, { "epoch": 0.2661130980666783, "grad_norm": 0.7742530107498169, "learning_rate": 4.903519739897755e-05, "loss": 0.0341, "num_input_tokens_seen": 4417440, "step": 1330 }, { "epoch": 0.26711352324738014, "grad_norm": 0.48411840200424194, "learning_rate": 4.902797707380771e-05, "loss": 0.033, "num_input_tokens_seen": 4434080, "step": 1335 }, { "epoch": 0.26811394842808195, "grad_norm": 1.2817881107330322, "learning_rate": 4.9020730367366875e-05, "loss": 0.0652, "num_input_tokens_seen": 4450592, "step": 1340 }, { "epoch": 0.2691143736087837, "grad_norm": 0.5104709267616272, "learning_rate": 4.901345728761151e-05, "loss": 0.0342, "num_input_tokens_seen": 4466496, "step": 1345 }, { "epoch": 0.2701147987894855, "grad_norm": 0.7885850071907043, "learning_rate": 4.9006157842527024e-05, "loss": 0.0485, "num_input_tokens_seen": 4483584, "step": 1350 }, { "epoch": 0.27111522397018734, "grad_norm": 0.8310132026672363, "learning_rate": 4.899883204012775e-05, "loss": 0.0605, "num_input_tokens_seen": 4498976, "step": 1355 }, { "epoch": 0.27211564915088915, "grad_norm": 0.43888920545578003, "learning_rate": 4.899147988845698e-05, "loss": 0.034, "num_input_tokens_seen": 4515072, "step": 1360 }, { "epoch": 0.2731160743315909, "grad_norm": 0.47633183002471924, "learning_rate": 4.898410139558695e-05, "loss": 0.0652, "num_input_tokens_seen": 4532160, "step": 1365 }, { "epoch": 0.2741164995122927, "grad_norm": 0.5134224891662598, "learning_rate": 4.8976696569618785e-05, "loss": 0.0312, "num_input_tokens_seen": 4548672, "step": 1370 }, { "epoch": 0.27511692469299454, "grad_norm": 0.9848766326904297, "learning_rate": 4.896926541868254e-05, "loss": 0.0432, "num_input_tokens_seen": 4565728, "step": 1375 }, { "epoch": 0.2761173498736963, "grad_norm": 0.6101632118225098, "learning_rate": 4.896180795093718e-05, "loss": 0.0403, "num_input_tokens_seen": 4583680, "step": 1380 }, { "epoch": 0.2771177750543981, "grad_norm": 0.3657839596271515, "learning_rate": 4.895432417457054e-05, "loss": 0.0331, "num_input_tokens_seen": 4600096, "step": 1385 }, { "epoch": 0.2781182002350999, "grad_norm": 0.634440541267395, "learning_rate": 4.8946814097799375e-05, "loss": 0.0365, "num_input_tokens_seen": 4618208, "step": 1390 }, { "epoch": 0.27911862541580174, "grad_norm": 0.4174501299858093, "learning_rate": 4.893927772886928e-05, "loss": 0.0529, "num_input_tokens_seen": 4634720, "step": 1395 }, { "epoch": 0.2801190505965035, "grad_norm": 1.0442984104156494, "learning_rate": 4.893171507605474e-05, "loss": 0.0493, "num_input_tokens_seen": 4650688, "step": 1400 }, { "epoch": 0.2811194757772053, "grad_norm": 0.3786543309688568, "learning_rate": 4.8924126147659085e-05, "loss": 0.0468, "num_input_tokens_seen": 4666912, "step": 1405 }, { "epoch": 0.2821199009579071, "grad_norm": 0.8404293656349182, "learning_rate": 4.8916510952014515e-05, "loss": 0.0476, "num_input_tokens_seen": 4682688, "step": 1410 }, { "epoch": 0.28312032613860894, "grad_norm": 1.00434410572052, "learning_rate": 4.890886949748205e-05, "loss": 0.0612, "num_input_tokens_seen": 4699168, "step": 1415 }, { "epoch": 0.2841207513193107, "grad_norm": 0.49716895818710327, "learning_rate": 4.890120179245155e-05, "loss": 0.036, "num_input_tokens_seen": 4715488, "step": 1420 }, { "epoch": 0.2851211765000125, "grad_norm": 0.8475269079208374, "learning_rate": 4.889350784534168e-05, "loss": 0.0558, "num_input_tokens_seen": 4731456, "step": 1425 }, { "epoch": 0.2861216016807143, "grad_norm": 1.1672545671463013, "learning_rate": 4.888578766459994e-05, "loss": 0.0411, "num_input_tokens_seen": 4748224, "step": 1430 }, { "epoch": 0.2871220268614161, "grad_norm": 0.5872476100921631, "learning_rate": 4.887804125870262e-05, "loss": 0.0404, "num_input_tokens_seen": 4765280, "step": 1435 }, { "epoch": 0.2881224520421179, "grad_norm": 0.16453994810581207, "learning_rate": 4.887026863615482e-05, "loss": 0.0446, "num_input_tokens_seen": 4781760, "step": 1440 }, { "epoch": 0.2891228772228197, "grad_norm": 0.47434720396995544, "learning_rate": 4.8862469805490396e-05, "loss": 0.0262, "num_input_tokens_seen": 4798432, "step": 1445 }, { "epoch": 0.2901233024035215, "grad_norm": 0.9123254418373108, "learning_rate": 4.8854644775271996e-05, "loss": 0.026, "num_input_tokens_seen": 4815808, "step": 1450 }, { "epoch": 0.2911237275842233, "grad_norm": 0.26198312640190125, "learning_rate": 4.884679355409103e-05, "loss": 0.0598, "num_input_tokens_seen": 4831328, "step": 1455 }, { "epoch": 0.2921241527649251, "grad_norm": 0.6132438778877258, "learning_rate": 4.883891615056768e-05, "loss": 0.0673, "num_input_tokens_seen": 4847616, "step": 1460 }, { "epoch": 0.2931245779456269, "grad_norm": 1.4866887331008911, "learning_rate": 4.883101257335084e-05, "loss": 0.0445, "num_input_tokens_seen": 4863872, "step": 1465 }, { "epoch": 0.29412500312632867, "grad_norm": 0.7224521040916443, "learning_rate": 4.882308283111818e-05, "loss": 0.0547, "num_input_tokens_seen": 4881280, "step": 1470 }, { "epoch": 0.2951254283070305, "grad_norm": 1.2841455936431885, "learning_rate": 4.881512693257607e-05, "loss": 0.0731, "num_input_tokens_seen": 4898432, "step": 1475 }, { "epoch": 0.2961258534877323, "grad_norm": 0.356650710105896, "learning_rate": 4.88071448864596e-05, "loss": 0.0431, "num_input_tokens_seen": 4915744, "step": 1480 }, { "epoch": 0.2971262786684341, "grad_norm": 0.39753177762031555, "learning_rate": 4.879913670153259e-05, "loss": 0.0332, "num_input_tokens_seen": 4932800, "step": 1485 }, { "epoch": 0.29812670384913587, "grad_norm": 1.4896060228347778, "learning_rate": 4.879110238658754e-05, "loss": 0.0437, "num_input_tokens_seen": 4949664, "step": 1490 }, { "epoch": 0.2991271290298377, "grad_norm": 0.6999287605285645, "learning_rate": 4.8783041950445644e-05, "loss": 0.0452, "num_input_tokens_seen": 4966432, "step": 1495 }, { "epoch": 0.3001275542105395, "grad_norm": 0.699177086353302, "learning_rate": 4.877495540195677e-05, "loss": 0.0673, "num_input_tokens_seen": 4983392, "step": 1500 }, { "epoch": 0.30112797939124125, "grad_norm": 0.6445897221565247, "learning_rate": 4.8766842749999465e-05, "loss": 0.0438, "num_input_tokens_seen": 5000736, "step": 1505 }, { "epoch": 0.30212840457194307, "grad_norm": 0.2621614634990692, "learning_rate": 4.875870400348093e-05, "loss": 0.0631, "num_input_tokens_seen": 5017696, "step": 1510 }, { "epoch": 0.3031288297526449, "grad_norm": 0.7124674320220947, "learning_rate": 4.875053917133702e-05, "loss": 0.0576, "num_input_tokens_seen": 5034496, "step": 1515 }, { "epoch": 0.3041292549333467, "grad_norm": 0.4209171533584595, "learning_rate": 4.874234826253223e-05, "loss": 0.0398, "num_input_tokens_seen": 5051968, "step": 1520 }, { "epoch": 0.30512968011404845, "grad_norm": 0.6975089311599731, "learning_rate": 4.873413128605968e-05, "loss": 0.0291, "num_input_tokens_seen": 5067776, "step": 1525 }, { "epoch": 0.30613010529475027, "grad_norm": 0.41621458530426025, "learning_rate": 4.872588825094112e-05, "loss": 0.0465, "num_input_tokens_seen": 5083680, "step": 1530 }, { "epoch": 0.3071305304754521, "grad_norm": 1.3077008724212646, "learning_rate": 4.8717619166226904e-05, "loss": 0.0611, "num_input_tokens_seen": 5099744, "step": 1535 }, { "epoch": 0.30813095565615384, "grad_norm": 0.12024356424808502, "learning_rate": 4.8709324040995985e-05, "loss": 0.0527, "num_input_tokens_seen": 5117632, "step": 1540 }, { "epoch": 0.30913138083685565, "grad_norm": 0.7145063281059265, "learning_rate": 4.8701002884355915e-05, "loss": 0.046, "num_input_tokens_seen": 5134592, "step": 1545 }, { "epoch": 0.31013180601755747, "grad_norm": 0.8199822902679443, "learning_rate": 4.869265570544282e-05, "loss": 0.0561, "num_input_tokens_seen": 5151648, "step": 1550 }, { "epoch": 0.3111322311982593, "grad_norm": 0.5546838641166687, "learning_rate": 4.86842825134214e-05, "loss": 0.0398, "num_input_tokens_seen": 5169440, "step": 1555 }, { "epoch": 0.31213265637896104, "grad_norm": 1.3934036493301392, "learning_rate": 4.8675883317484916e-05, "loss": 0.0438, "num_input_tokens_seen": 5185984, "step": 1560 }, { "epoch": 0.31313308155966285, "grad_norm": 0.3828260004520416, "learning_rate": 4.8667458126855184e-05, "loss": 0.0371, "num_input_tokens_seen": 5202784, "step": 1565 }, { "epoch": 0.31413350674036467, "grad_norm": 0.6127660274505615, "learning_rate": 4.865900695078255e-05, "loss": 0.0428, "num_input_tokens_seen": 5219776, "step": 1570 }, { "epoch": 0.3151339319210665, "grad_norm": 0.2966005504131317, "learning_rate": 4.86505297985459e-05, "loss": 0.0333, "num_input_tokens_seen": 5235904, "step": 1575 }, { "epoch": 0.31613435710176824, "grad_norm": 0.5187402963638306, "learning_rate": 4.8642026679452626e-05, "loss": 0.049, "num_input_tokens_seen": 5253216, "step": 1580 }, { "epoch": 0.31713478228247005, "grad_norm": 1.12813401222229, "learning_rate": 4.8633497602838654e-05, "loss": 0.0393, "num_input_tokens_seen": 5269632, "step": 1585 }, { "epoch": 0.31813520746317187, "grad_norm": 0.7938510775566101, "learning_rate": 4.8624942578068386e-05, "loss": 0.0525, "num_input_tokens_seen": 5287296, "step": 1590 }, { "epoch": 0.3191356326438736, "grad_norm": 0.5008676648139954, "learning_rate": 4.861636161453473e-05, "loss": 0.0295, "num_input_tokens_seen": 5304928, "step": 1595 }, { "epoch": 0.32013605782457544, "grad_norm": 0.5503880381584167, "learning_rate": 4.860775472165906e-05, "loss": 0.0429, "num_input_tokens_seen": 5320960, "step": 1600 }, { "epoch": 0.32113648300527725, "grad_norm": 0.8359606862068176, "learning_rate": 4.859912190889123e-05, "loss": 0.0564, "num_input_tokens_seen": 5337536, "step": 1605 }, { "epoch": 0.32213690818597907, "grad_norm": 0.3948289752006531, "learning_rate": 4.8590463185709555e-05, "loss": 0.0284, "num_input_tokens_seen": 5354272, "step": 1610 }, { "epoch": 0.3231373333666808, "grad_norm": 0.5302601456642151, "learning_rate": 4.858177856162078e-05, "loss": 0.0538, "num_input_tokens_seen": 5371968, "step": 1615 }, { "epoch": 0.32413775854738264, "grad_norm": 1.3221774101257324, "learning_rate": 4.857306804616011e-05, "loss": 0.0618, "num_input_tokens_seen": 5387936, "step": 1620 }, { "epoch": 0.32513818372808445, "grad_norm": 1.5478042364120483, "learning_rate": 4.856433164889116e-05, "loss": 0.0556, "num_input_tokens_seen": 5404800, "step": 1625 }, { "epoch": 0.3261386089087862, "grad_norm": 0.3014490604400635, "learning_rate": 4.855556937940596e-05, "loss": 0.0394, "num_input_tokens_seen": 5420992, "step": 1630 }, { "epoch": 0.327139034089488, "grad_norm": 0.424380362033844, "learning_rate": 4.854678124732496e-05, "loss": 0.0219, "num_input_tokens_seen": 5436704, "step": 1635 }, { "epoch": 0.32813945927018984, "grad_norm": 0.38589712977409363, "learning_rate": 4.853796726229701e-05, "loss": 0.0178, "num_input_tokens_seen": 5453472, "step": 1640 }, { "epoch": 0.32913988445089165, "grad_norm": 0.35621973872184753, "learning_rate": 4.852912743399932e-05, "loss": 0.0295, "num_input_tokens_seen": 5470656, "step": 1645 }, { "epoch": 0.3301403096315934, "grad_norm": 0.6708896160125732, "learning_rate": 4.85202617721375e-05, "loss": 0.0291, "num_input_tokens_seen": 5486752, "step": 1650 }, { "epoch": 0.3311407348122952, "grad_norm": 1.6848129034042358, "learning_rate": 4.85113702864455e-05, "loss": 0.0777, "num_input_tokens_seen": 5502464, "step": 1655 }, { "epoch": 0.33214115999299704, "grad_norm": 0.3120448887348175, "learning_rate": 4.850245298668564e-05, "loss": 0.0341, "num_input_tokens_seen": 5519680, "step": 1660 }, { "epoch": 0.3331415851736988, "grad_norm": 1.8794240951538086, "learning_rate": 4.849350988264859e-05, "loss": 0.0652, "num_input_tokens_seen": 5535200, "step": 1665 }, { "epoch": 0.3341420103544006, "grad_norm": 0.48631739616394043, "learning_rate": 4.848454098415333e-05, "loss": 0.0297, "num_input_tokens_seen": 5553664, "step": 1670 }, { "epoch": 0.3351424355351024, "grad_norm": 0.2804921567440033, "learning_rate": 4.847554630104716e-05, "loss": 0.0346, "num_input_tokens_seen": 5570304, "step": 1675 }, { "epoch": 0.33614286071580424, "grad_norm": 0.3838098347187042, "learning_rate": 4.846652584320571e-05, "loss": 0.0291, "num_input_tokens_seen": 5586624, "step": 1680 }, { "epoch": 0.337143285896506, "grad_norm": 0.7481986284255981, "learning_rate": 4.8457479620532906e-05, "loss": 0.0333, "num_input_tokens_seen": 5603424, "step": 1685 }, { "epoch": 0.3381437110772078, "grad_norm": 0.2822953760623932, "learning_rate": 4.8448407642960946e-05, "loss": 0.03, "num_input_tokens_seen": 5619040, "step": 1690 }, { "epoch": 0.3391441362579096, "grad_norm": 0.313073992729187, "learning_rate": 4.8439309920450314e-05, "loss": 0.0632, "num_input_tokens_seen": 5636512, "step": 1695 }, { "epoch": 0.34014456143861144, "grad_norm": 0.4042295217514038, "learning_rate": 4.8430186462989765e-05, "loss": 0.041, "num_input_tokens_seen": 5653408, "step": 1700 }, { "epoch": 0.3411449866193132, "grad_norm": 0.6126199960708618, "learning_rate": 4.8421037280596304e-05, "loss": 0.0275, "num_input_tokens_seen": 5670752, "step": 1705 }, { "epoch": 0.342145411800015, "grad_norm": 0.6158226728439331, "learning_rate": 4.8411862383315185e-05, "loss": 0.0315, "num_input_tokens_seen": 5686752, "step": 1710 }, { "epoch": 0.3431458369807168, "grad_norm": 0.19262298941612244, "learning_rate": 4.840266178121989e-05, "loss": 0.0419, "num_input_tokens_seen": 5702880, "step": 1715 }, { "epoch": 0.3441462621614186, "grad_norm": 0.6169284582138062, "learning_rate": 4.839343548441213e-05, "loss": 0.0423, "num_input_tokens_seen": 5720384, "step": 1720 }, { "epoch": 0.3451466873421204, "grad_norm": 0.5302843451499939, "learning_rate": 4.8384183503021815e-05, "loss": 0.0549, "num_input_tokens_seen": 5735808, "step": 1725 }, { "epoch": 0.3461471125228222, "grad_norm": 0.3886967897415161, "learning_rate": 4.837490584720707e-05, "loss": 0.0299, "num_input_tokens_seen": 5752544, "step": 1730 }, { "epoch": 0.347147537703524, "grad_norm": 0.5098423957824707, "learning_rate": 4.836560252715419e-05, "loss": 0.0405, "num_input_tokens_seen": 5769312, "step": 1735 }, { "epoch": 0.3481479628842258, "grad_norm": 0.8524506688117981, "learning_rate": 4.835627355307767e-05, "loss": 0.0393, "num_input_tokens_seen": 5785152, "step": 1740 }, { "epoch": 0.3491483880649276, "grad_norm": 1.0980675220489502, "learning_rate": 4.834691893522016e-05, "loss": 0.0711, "num_input_tokens_seen": 5802848, "step": 1745 }, { "epoch": 0.3501488132456294, "grad_norm": 0.59861159324646, "learning_rate": 4.833753868385247e-05, "loss": 0.0317, "num_input_tokens_seen": 5818816, "step": 1750 }, { "epoch": 0.35114923842633117, "grad_norm": 0.36550024151802063, "learning_rate": 4.832813280927355e-05, "loss": 0.0502, "num_input_tokens_seen": 5835648, "step": 1755 }, { "epoch": 0.352149663607033, "grad_norm": 0.2625948488712311, "learning_rate": 4.8318701321810475e-05, "loss": 0.0443, "num_input_tokens_seen": 5852192, "step": 1760 }, { "epoch": 0.3531500887877348, "grad_norm": 0.7051465511322021, "learning_rate": 4.830924423181846e-05, "loss": 0.0431, "num_input_tokens_seen": 5869184, "step": 1765 }, { "epoch": 0.3541505139684366, "grad_norm": 0.3694879412651062, "learning_rate": 4.829976154968081e-05, "loss": 0.0525, "num_input_tokens_seen": 5885856, "step": 1770 }, { "epoch": 0.35515093914913837, "grad_norm": 0.4274010956287384, "learning_rate": 4.829025328580895e-05, "loss": 0.045, "num_input_tokens_seen": 5902560, "step": 1775 }, { "epoch": 0.3561513643298402, "grad_norm": 0.6589645147323608, "learning_rate": 4.828071945064238e-05, "loss": 0.0434, "num_input_tokens_seen": 5920576, "step": 1780 }, { "epoch": 0.357151789510542, "grad_norm": 0.632914662361145, "learning_rate": 4.8271160054648665e-05, "loss": 0.0254, "num_input_tokens_seen": 5939456, "step": 1785 }, { "epoch": 0.35815221469124375, "grad_norm": 0.7634440660476685, "learning_rate": 4.8261575108323456e-05, "loss": 0.0541, "num_input_tokens_seen": 5955744, "step": 1790 }, { "epoch": 0.35915263987194557, "grad_norm": 0.7146453857421875, "learning_rate": 4.8251964622190436e-05, "loss": 0.0549, "num_input_tokens_seen": 5971488, "step": 1795 }, { "epoch": 0.3601530650526474, "grad_norm": 0.6739281415939331, "learning_rate": 4.824232860680136e-05, "loss": 0.0461, "num_input_tokens_seen": 5988160, "step": 1800 }, { "epoch": 0.3611534902333492, "grad_norm": 0.37205588817596436, "learning_rate": 4.8232667072735966e-05, "loss": 0.0325, "num_input_tokens_seen": 6004576, "step": 1805 }, { "epoch": 0.36215391541405095, "grad_norm": 0.5811402797698975, "learning_rate": 4.8222980030602054e-05, "loss": 0.0369, "num_input_tokens_seen": 6020992, "step": 1810 }, { "epoch": 0.36315434059475277, "grad_norm": 0.19677260518074036, "learning_rate": 4.8213267491035406e-05, "loss": 0.0326, "num_input_tokens_seen": 6039168, "step": 1815 }, { "epoch": 0.3641547657754546, "grad_norm": 0.8598901033401489, "learning_rate": 4.820352946469982e-05, "loss": 0.0409, "num_input_tokens_seen": 6055488, "step": 1820 }, { "epoch": 0.36515519095615634, "grad_norm": 0.5264309644699097, "learning_rate": 4.819376596228703e-05, "loss": 0.0162, "num_input_tokens_seen": 6072544, "step": 1825 }, { "epoch": 0.36615561613685815, "grad_norm": 0.5197092294692993, "learning_rate": 4.81839769945168e-05, "loss": 0.053, "num_input_tokens_seen": 6087808, "step": 1830 }, { "epoch": 0.36715604131755997, "grad_norm": 0.36588457226753235, "learning_rate": 4.817416257213681e-05, "loss": 0.0449, "num_input_tokens_seen": 6104320, "step": 1835 }, { "epoch": 0.3681564664982618, "grad_norm": 0.8020911812782288, "learning_rate": 4.8164322705922736e-05, "loss": 0.0289, "num_input_tokens_seen": 6120960, "step": 1840 }, { "epoch": 0.36915689167896354, "grad_norm": 0.32438233494758606, "learning_rate": 4.815445740667812e-05, "loss": 0.0443, "num_input_tokens_seen": 6138432, "step": 1845 }, { "epoch": 0.37015731685966535, "grad_norm": 0.5802657008171082, "learning_rate": 4.814456668523448e-05, "loss": 0.0549, "num_input_tokens_seen": 6154496, "step": 1850 }, { "epoch": 0.37115774204036717, "grad_norm": 0.11963453888893127, "learning_rate": 4.813465055245124e-05, "loss": 0.0433, "num_input_tokens_seen": 6172352, "step": 1855 }, { "epoch": 0.372158167221069, "grad_norm": 0.34996163845062256, "learning_rate": 4.8124709019215716e-05, "loss": 0.0216, "num_input_tokens_seen": 6188000, "step": 1860 }, { "epoch": 0.37315859240177074, "grad_norm": 0.24354368448257446, "learning_rate": 4.81147420964431e-05, "loss": 0.053, "num_input_tokens_seen": 6203712, "step": 1865 }, { "epoch": 0.37415901758247255, "grad_norm": 0.5403783321380615, "learning_rate": 4.810474979507648e-05, "loss": 0.0487, "num_input_tokens_seen": 6219968, "step": 1870 }, { "epoch": 0.37515944276317437, "grad_norm": 0.46348753571510315, "learning_rate": 4.80947321260868e-05, "loss": 0.0342, "num_input_tokens_seen": 6237184, "step": 1875 }, { "epoch": 0.3761598679438761, "grad_norm": 1.2938811779022217, "learning_rate": 4.808468910047286e-05, "loss": 0.0432, "num_input_tokens_seen": 6252800, "step": 1880 }, { "epoch": 0.37716029312457794, "grad_norm": 0.25433051586151123, "learning_rate": 4.8074620729261287e-05, "loss": 0.0449, "num_input_tokens_seen": 6269344, "step": 1885 }, { "epoch": 0.37816071830527975, "grad_norm": 0.9118472933769226, "learning_rate": 4.806452702350656e-05, "loss": 0.0413, "num_input_tokens_seen": 6286592, "step": 1890 }, { "epoch": 0.37916114348598157, "grad_norm": 0.7628657817840576, "learning_rate": 4.805440799429095e-05, "loss": 0.0428, "num_input_tokens_seen": 6302336, "step": 1895 }, { "epoch": 0.3801615686666833, "grad_norm": 0.45024359226226807, "learning_rate": 4.804426365272455e-05, "loss": 0.0259, "num_input_tokens_seen": 6318816, "step": 1900 }, { "epoch": 0.38116199384738514, "grad_norm": 1.1999708414077759, "learning_rate": 4.8034094009945215e-05, "loss": 0.0451, "num_input_tokens_seen": 6335008, "step": 1905 }, { "epoch": 0.38216241902808695, "grad_norm": 0.4844655394554138, "learning_rate": 4.802389907711863e-05, "loss": 0.0685, "num_input_tokens_seen": 6351904, "step": 1910 }, { "epoch": 0.3831628442087887, "grad_norm": 0.32273247838020325, "learning_rate": 4.801367886543819e-05, "loss": 0.0402, "num_input_tokens_seen": 6367680, "step": 1915 }, { "epoch": 0.3841632693894905, "grad_norm": 0.2264421135187149, "learning_rate": 4.80034333861251e-05, "loss": 0.0514, "num_input_tokens_seen": 6384768, "step": 1920 }, { "epoch": 0.38516369457019234, "grad_norm": 0.3693816661834717, "learning_rate": 4.799316265042825e-05, "loss": 0.0645, "num_input_tokens_seen": 6400928, "step": 1925 }, { "epoch": 0.38616411975089415, "grad_norm": 0.19210654497146606, "learning_rate": 4.798286666962431e-05, "loss": 0.0415, "num_input_tokens_seen": 6417984, "step": 1930 }, { "epoch": 0.3871645449315959, "grad_norm": 0.9294410943984985, "learning_rate": 4.797254545501763e-05, "loss": 0.0506, "num_input_tokens_seen": 6434432, "step": 1935 }, { "epoch": 0.3881649701122977, "grad_norm": 0.18243177235126495, "learning_rate": 4.7962199017940283e-05, "loss": 0.0712, "num_input_tokens_seen": 6450624, "step": 1940 }, { "epoch": 0.38916539529299954, "grad_norm": 0.40890705585479736, "learning_rate": 4.795182736975205e-05, "loss": 0.0394, "num_input_tokens_seen": 6466272, "step": 1945 }, { "epoch": 0.3901658204737013, "grad_norm": 0.22906385362148285, "learning_rate": 4.794143052184037e-05, "loss": 0.0418, "num_input_tokens_seen": 6482784, "step": 1950 }, { "epoch": 0.3911662456544031, "grad_norm": 0.27830296754837036, "learning_rate": 4.793100848562034e-05, "loss": 0.0245, "num_input_tokens_seen": 6500160, "step": 1955 }, { "epoch": 0.3921666708351049, "grad_norm": 0.32560041546821594, "learning_rate": 4.7920561272534745e-05, "loss": 0.0445, "num_input_tokens_seen": 6517312, "step": 1960 }, { "epoch": 0.39316709601580674, "grad_norm": 0.34138888120651245, "learning_rate": 4.7910088894053983e-05, "loss": 0.0602, "num_input_tokens_seen": 6533760, "step": 1965 }, { "epoch": 0.3941675211965085, "grad_norm": 0.45417314767837524, "learning_rate": 4.789959136167611e-05, "loss": 0.0299, "num_input_tokens_seen": 6550752, "step": 1970 }, { "epoch": 0.3951679463772103, "grad_norm": 0.6495351791381836, "learning_rate": 4.788906868692677e-05, "loss": 0.0194, "num_input_tokens_seen": 6567328, "step": 1975 }, { "epoch": 0.3961683715579121, "grad_norm": 0.8971573114395142, "learning_rate": 4.787852088135923e-05, "loss": 0.039, "num_input_tokens_seen": 6584096, "step": 1980 }, { "epoch": 0.39716879673861394, "grad_norm": 0.6822695732116699, "learning_rate": 4.7867947956554346e-05, "loss": 0.0233, "num_input_tokens_seen": 6600736, "step": 1985 }, { "epoch": 0.3981692219193157, "grad_norm": 0.8708227872848511, "learning_rate": 4.7857349924120556e-05, "loss": 0.0649, "num_input_tokens_seen": 6616896, "step": 1990 }, { "epoch": 0.3991696471000175, "grad_norm": 0.4138895273208618, "learning_rate": 4.7846726795693855e-05, "loss": 0.0444, "num_input_tokens_seen": 6633472, "step": 1995 }, { "epoch": 0.4001700722807193, "grad_norm": 0.2843296229839325, "learning_rate": 4.78360785829378e-05, "loss": 0.0362, "num_input_tokens_seen": 6650496, "step": 2000 }, { "epoch": 0.4011704974614211, "grad_norm": 0.4477090537548065, "learning_rate": 4.782540529754348e-05, "loss": 0.0282, "num_input_tokens_seen": 6666528, "step": 2005 }, { "epoch": 0.4021709226421229, "grad_norm": 0.31738659739494324, "learning_rate": 4.781470695122954e-05, "loss": 0.0334, "num_input_tokens_seen": 6683136, "step": 2010 }, { "epoch": 0.4031713478228247, "grad_norm": 0.3024028241634369, "learning_rate": 4.7803983555742096e-05, "loss": 0.0272, "num_input_tokens_seen": 6700128, "step": 2015 }, { "epoch": 0.4041717730035265, "grad_norm": 0.5160415172576904, "learning_rate": 4.77932351228548e-05, "loss": 0.0178, "num_input_tokens_seen": 6716608, "step": 2020 }, { "epoch": 0.4051721981842283, "grad_norm": 0.6938416361808777, "learning_rate": 4.7782461664368786e-05, "loss": 0.0464, "num_input_tokens_seen": 6733312, "step": 2025 }, { "epoch": 0.4061726233649301, "grad_norm": 0.5055555701255798, "learning_rate": 4.777166319211266e-05, "loss": 0.0489, "num_input_tokens_seen": 6748160, "step": 2030 }, { "epoch": 0.4071730485456319, "grad_norm": 0.6181210279464722, "learning_rate": 4.7760839717942495e-05, "loss": 0.0501, "num_input_tokens_seen": 6765088, "step": 2035 }, { "epoch": 0.40817347372633367, "grad_norm": 0.4572533667087555, "learning_rate": 4.774999125374181e-05, "loss": 0.0659, "num_input_tokens_seen": 6781760, "step": 2040 }, { "epoch": 0.4091738989070355, "grad_norm": 0.6048961877822876, "learning_rate": 4.7739117811421566e-05, "loss": 0.0367, "num_input_tokens_seen": 6798848, "step": 2045 }, { "epoch": 0.4101743240877373, "grad_norm": 0.15084992349147797, "learning_rate": 4.7728219402920156e-05, "loss": 0.0281, "num_input_tokens_seen": 6815264, "step": 2050 }, { "epoch": 0.4111747492684391, "grad_norm": 0.36724817752838135, "learning_rate": 4.771729604020336e-05, "loss": 0.0495, "num_input_tokens_seen": 6830816, "step": 2055 }, { "epoch": 0.41217517444914087, "grad_norm": 0.23971237242221832, "learning_rate": 4.7706347735264385e-05, "loss": 0.0271, "num_input_tokens_seen": 6847072, "step": 2060 }, { "epoch": 0.4131755996298427, "grad_norm": 0.7865434885025024, "learning_rate": 4.76953745001238e-05, "loss": 0.0301, "num_input_tokens_seen": 6863072, "step": 2065 }, { "epoch": 0.4141760248105445, "grad_norm": 0.28162121772766113, "learning_rate": 4.768437634682957e-05, "loss": 0.0299, "num_input_tokens_seen": 6880288, "step": 2070 }, { "epoch": 0.41517644999124625, "grad_norm": 0.6597485542297363, "learning_rate": 4.767335328745699e-05, "loss": 0.0639, "num_input_tokens_seen": 6896928, "step": 2075 }, { "epoch": 0.41617687517194807, "grad_norm": 0.0439864918589592, "learning_rate": 4.766230533410872e-05, "loss": 0.0387, "num_input_tokens_seen": 6912736, "step": 2080 }, { "epoch": 0.4171773003526499, "grad_norm": 0.547297477722168, "learning_rate": 4.7651232498914755e-05, "loss": 0.0766, "num_input_tokens_seen": 6929312, "step": 2085 }, { "epoch": 0.4181777255333517, "grad_norm": 0.5888490676879883, "learning_rate": 4.7640134794032386e-05, "loss": 0.0263, "num_input_tokens_seen": 6945888, "step": 2090 }, { "epoch": 0.41917815071405345, "grad_norm": 0.6623382568359375, "learning_rate": 4.762901223164624e-05, "loss": 0.0408, "num_input_tokens_seen": 6964000, "step": 2095 }, { "epoch": 0.42017857589475527, "grad_norm": 0.33597245812416077, "learning_rate": 4.7617864823968205e-05, "loss": 0.0318, "num_input_tokens_seen": 6980672, "step": 2100 }, { "epoch": 0.4211790010754571, "grad_norm": 0.5683799982070923, "learning_rate": 4.7606692583237476e-05, "loss": 0.035, "num_input_tokens_seen": 6998688, "step": 2105 }, { "epoch": 0.42217942625615884, "grad_norm": 1.111972689628601, "learning_rate": 4.759549552172049e-05, "loss": 0.0634, "num_input_tokens_seen": 7015104, "step": 2110 }, { "epoch": 0.42317985143686065, "grad_norm": 0.43278875946998596, "learning_rate": 4.7584273651710955e-05, "loss": 0.0396, "num_input_tokens_seen": 7032224, "step": 2115 }, { "epoch": 0.42418027661756247, "grad_norm": 0.49375009536743164, "learning_rate": 4.75730269855298e-05, "loss": 0.0433, "num_input_tokens_seen": 7048192, "step": 2120 }, { "epoch": 0.4251807017982643, "grad_norm": 0.6873200535774231, "learning_rate": 4.7561755535525196e-05, "loss": 0.043, "num_input_tokens_seen": 7064416, "step": 2125 }, { "epoch": 0.42618112697896604, "grad_norm": 0.6794641613960266, "learning_rate": 4.7550459314072505e-05, "loss": 0.0251, "num_input_tokens_seen": 7080320, "step": 2130 }, { "epoch": 0.42718155215966785, "grad_norm": 0.6741289496421814, "learning_rate": 4.75391383335743e-05, "loss": 0.0339, "num_input_tokens_seen": 7097120, "step": 2135 }, { "epoch": 0.42818197734036967, "grad_norm": 1.288682460784912, "learning_rate": 4.7527792606460344e-05, "loss": 0.0538, "num_input_tokens_seen": 7114848, "step": 2140 }, { "epoch": 0.4291824025210715, "grad_norm": 0.36019641160964966, "learning_rate": 4.751642214518756e-05, "loss": 0.0251, "num_input_tokens_seen": 7131008, "step": 2145 }, { "epoch": 0.43018282770177324, "grad_norm": 0.5206799507141113, "learning_rate": 4.750502696224002e-05, "loss": 0.0383, "num_input_tokens_seen": 7148160, "step": 2150 }, { "epoch": 0.43118325288247505, "grad_norm": 0.5463341474533081, "learning_rate": 4.749360707012895e-05, "loss": 0.0585, "num_input_tokens_seen": 7164288, "step": 2155 }, { "epoch": 0.43218367806317687, "grad_norm": 0.43360671401023865, "learning_rate": 4.7482162481392704e-05, "loss": 0.0328, "num_input_tokens_seen": 7181632, "step": 2160 }, { "epoch": 0.4331841032438786, "grad_norm": 0.2629452049732208, "learning_rate": 4.7470693208596764e-05, "loss": 0.025, "num_input_tokens_seen": 7198112, "step": 2165 }, { "epoch": 0.43418452842458044, "grad_norm": 0.801326334476471, "learning_rate": 4.745919926433368e-05, "loss": 0.0458, "num_input_tokens_seen": 7213696, "step": 2170 }, { "epoch": 0.43518495360528225, "grad_norm": 0.30609095096588135, "learning_rate": 4.7447680661223126e-05, "loss": 0.0294, "num_input_tokens_seen": 7229920, "step": 2175 }, { "epoch": 0.43618537878598407, "grad_norm": 0.5648137331008911, "learning_rate": 4.743613741191183e-05, "loss": 0.0497, "num_input_tokens_seen": 7245472, "step": 2180 }, { "epoch": 0.4371858039666858, "grad_norm": 0.9186530709266663, "learning_rate": 4.742456952907358e-05, "loss": 0.0508, "num_input_tokens_seen": 7261088, "step": 2185 }, { "epoch": 0.43818622914738764, "grad_norm": 0.7290270328521729, "learning_rate": 4.7412977025409225e-05, "loss": 0.0526, "num_input_tokens_seen": 7276160, "step": 2190 }, { "epoch": 0.43918665432808945, "grad_norm": 0.6139854788780212, "learning_rate": 4.740135991364662e-05, "loss": 0.0324, "num_input_tokens_seen": 7293600, "step": 2195 }, { "epoch": 0.4401870795087912, "grad_norm": 0.17772167921066284, "learning_rate": 4.738971820654066e-05, "loss": 0.0341, "num_input_tokens_seen": 7310304, "step": 2200 }, { "epoch": 0.441187504689493, "grad_norm": 0.6025674343109131, "learning_rate": 4.737805191687323e-05, "loss": 0.0473, "num_input_tokens_seen": 7327072, "step": 2205 }, { "epoch": 0.44218792987019484, "grad_norm": 0.23449702560901642, "learning_rate": 4.7366361057453236e-05, "loss": 0.0418, "num_input_tokens_seen": 7344064, "step": 2210 }, { "epoch": 0.44318835505089665, "grad_norm": 0.6478351950645447, "learning_rate": 4.7354645641116526e-05, "loss": 0.0332, "num_input_tokens_seen": 7360096, "step": 2215 }, { "epoch": 0.4441887802315984, "grad_norm": 0.8951166272163391, "learning_rate": 4.7342905680725906e-05, "loss": 0.0328, "num_input_tokens_seen": 7376704, "step": 2220 }, { "epoch": 0.4451892054123002, "grad_norm": 0.7574548721313477, "learning_rate": 4.733114118917117e-05, "loss": 0.0548, "num_input_tokens_seen": 7394304, "step": 2225 }, { "epoch": 0.44618963059300204, "grad_norm": 0.2258099615573883, "learning_rate": 4.7319352179369e-05, "loss": 0.0368, "num_input_tokens_seen": 7411456, "step": 2230 }, { "epoch": 0.4471900557737038, "grad_norm": 0.49726584553718567, "learning_rate": 4.730753866426303e-05, "loss": 0.0257, "num_input_tokens_seen": 7428512, "step": 2235 }, { "epoch": 0.4481904809544056, "grad_norm": 0.6473379135131836, "learning_rate": 4.729570065682378e-05, "loss": 0.0392, "num_input_tokens_seen": 7445408, "step": 2240 }, { "epoch": 0.4491909061351074, "grad_norm": 0.7364275455474854, "learning_rate": 4.7283838170048674e-05, "loss": 0.033, "num_input_tokens_seen": 7462464, "step": 2245 }, { "epoch": 0.45019133131580924, "grad_norm": 0.2830129861831665, "learning_rate": 4.727195121696202e-05, "loss": 0.046, "num_input_tokens_seen": 7479392, "step": 2250 }, { "epoch": 0.451191756496511, "grad_norm": 0.3266063332557678, "learning_rate": 4.7260039810614954e-05, "loss": 0.0278, "num_input_tokens_seen": 7495424, "step": 2255 }, { "epoch": 0.4521921816772128, "grad_norm": 0.38082149624824524, "learning_rate": 4.724810396408549e-05, "loss": 0.0245, "num_input_tokens_seen": 7512448, "step": 2260 }, { "epoch": 0.4531926068579146, "grad_norm": 1.0159626007080078, "learning_rate": 4.723614369047847e-05, "loss": 0.0455, "num_input_tokens_seen": 7528640, "step": 2265 }, { "epoch": 0.4541930320386164, "grad_norm": 0.5355516076087952, "learning_rate": 4.722415900292555e-05, "loss": 0.042, "num_input_tokens_seen": 7544576, "step": 2270 }, { "epoch": 0.4551934572193182, "grad_norm": 0.17202123999595642, "learning_rate": 4.721214991458521e-05, "loss": 0.0529, "num_input_tokens_seen": 7561664, "step": 2275 }, { "epoch": 0.45619388240002, "grad_norm": 0.4654875099658966, "learning_rate": 4.7200116438642686e-05, "loss": 0.0495, "num_input_tokens_seen": 7579136, "step": 2280 }, { "epoch": 0.4571943075807218, "grad_norm": 1.3036562204360962, "learning_rate": 4.718805858831002e-05, "loss": 0.039, "num_input_tokens_seen": 7597056, "step": 2285 }, { "epoch": 0.4581947327614236, "grad_norm": 0.6863433718681335, "learning_rate": 4.7175976376826e-05, "loss": 0.0385, "num_input_tokens_seen": 7615296, "step": 2290 }, { "epoch": 0.4591951579421254, "grad_norm": 0.6734871864318848, "learning_rate": 4.7163869817456176e-05, "loss": 0.0464, "num_input_tokens_seen": 7632064, "step": 2295 }, { "epoch": 0.4601955831228272, "grad_norm": 0.3591286242008209, "learning_rate": 4.7151738923492804e-05, "loss": 0.049, "num_input_tokens_seen": 7648224, "step": 2300 }, { "epoch": 0.461196008303529, "grad_norm": 0.5754177570343018, "learning_rate": 4.713958370825489e-05, "loss": 0.0304, "num_input_tokens_seen": 7664576, "step": 2305 }, { "epoch": 0.4621964334842308, "grad_norm": 0.21992503106594086, "learning_rate": 4.712740418508812e-05, "loss": 0.0412, "num_input_tokens_seen": 7681504, "step": 2310 }, { "epoch": 0.4631968586649326, "grad_norm": 0.712186336517334, "learning_rate": 4.711520036736488e-05, "loss": 0.0313, "num_input_tokens_seen": 7698656, "step": 2315 }, { "epoch": 0.4641972838456344, "grad_norm": 0.35100919008255005, "learning_rate": 4.7102972268484226e-05, "loss": 0.0301, "num_input_tokens_seen": 7716448, "step": 2320 }, { "epoch": 0.46519770902633617, "grad_norm": 0.6114473938941956, "learning_rate": 4.709071990187187e-05, "loss": 0.0262, "num_input_tokens_seen": 7732192, "step": 2325 }, { "epoch": 0.466198134207038, "grad_norm": 0.28752997517585754, "learning_rate": 4.707844328098018e-05, "loss": 0.0311, "num_input_tokens_seen": 7747904, "step": 2330 }, { "epoch": 0.4671985593877398, "grad_norm": 0.757440447807312, "learning_rate": 4.706614241928813e-05, "loss": 0.0379, "num_input_tokens_seen": 7764928, "step": 2335 }, { "epoch": 0.4681989845684416, "grad_norm": 1.7856647968292236, "learning_rate": 4.7053817330301345e-05, "loss": 0.0484, "num_input_tokens_seen": 7781248, "step": 2340 }, { "epoch": 0.46919940974914337, "grad_norm": 0.4046674966812134, "learning_rate": 4.7041468027552016e-05, "loss": 0.0363, "num_input_tokens_seen": 7796864, "step": 2345 }, { "epoch": 0.4701998349298452, "grad_norm": 0.8157446384429932, "learning_rate": 4.7029094524598936e-05, "loss": 0.0429, "num_input_tokens_seen": 7813888, "step": 2350 }, { "epoch": 0.471200260110547, "grad_norm": 0.7077159881591797, "learning_rate": 4.701669683502747e-05, "loss": 0.0748, "num_input_tokens_seen": 7829440, "step": 2355 }, { "epoch": 0.47220068529124876, "grad_norm": 0.5294864177703857, "learning_rate": 4.700427497244954e-05, "loss": 0.0427, "num_input_tokens_seen": 7847584, "step": 2360 }, { "epoch": 0.47320111047195057, "grad_norm": 0.44839006662368774, "learning_rate": 4.699182895050359e-05, "loss": 0.0316, "num_input_tokens_seen": 7864384, "step": 2365 }, { "epoch": 0.4742015356526524, "grad_norm": 0.2019323706626892, "learning_rate": 4.697935878285461e-05, "loss": 0.0354, "num_input_tokens_seen": 7881056, "step": 2370 }, { "epoch": 0.4752019608333542, "grad_norm": 0.6341044306755066, "learning_rate": 4.696686448319408e-05, "loss": 0.0392, "num_input_tokens_seen": 7897184, "step": 2375 }, { "epoch": 0.47620238601405596, "grad_norm": 0.020971478894352913, "learning_rate": 4.695434606524e-05, "loss": 0.0252, "num_input_tokens_seen": 7913824, "step": 2380 }, { "epoch": 0.47720281119475777, "grad_norm": 0.5249700546264648, "learning_rate": 4.694180354273684e-05, "loss": 0.0324, "num_input_tokens_seen": 7930560, "step": 2385 }, { "epoch": 0.4782032363754596, "grad_norm": 1.1561530828475952, "learning_rate": 4.692923692945553e-05, "loss": 0.0347, "num_input_tokens_seen": 7946976, "step": 2390 }, { "epoch": 0.47920366155616134, "grad_norm": 2.0148797035217285, "learning_rate": 4.6916646239193454e-05, "loss": 0.0359, "num_input_tokens_seen": 7963328, "step": 2395 }, { "epoch": 0.48020408673686316, "grad_norm": 0.909793496131897, "learning_rate": 4.690403148577443e-05, "loss": 0.0366, "num_input_tokens_seen": 7980576, "step": 2400 }, { "epoch": 0.48120451191756497, "grad_norm": 0.5779248476028442, "learning_rate": 4.68913926830487e-05, "loss": 0.0263, "num_input_tokens_seen": 7996224, "step": 2405 }, { "epoch": 0.4822049370982668, "grad_norm": 0.21549813449382782, "learning_rate": 4.6878729844892896e-05, "loss": 0.0306, "num_input_tokens_seen": 8012736, "step": 2410 }, { "epoch": 0.48320536227896854, "grad_norm": 0.44577720761299133, "learning_rate": 4.686604298521008e-05, "loss": 0.0468, "num_input_tokens_seen": 8029472, "step": 2415 }, { "epoch": 0.48420578745967036, "grad_norm": 0.7483829855918884, "learning_rate": 4.685333211792963e-05, "loss": 0.053, "num_input_tokens_seen": 8045376, "step": 2420 }, { "epoch": 0.48520621264037217, "grad_norm": 0.357555627822876, "learning_rate": 4.6840597257007325e-05, "loss": 0.0277, "num_input_tokens_seen": 8061504, "step": 2425 }, { "epoch": 0.486206637821074, "grad_norm": 0.308903306722641, "learning_rate": 4.682783841642527e-05, "loss": 0.0392, "num_input_tokens_seen": 8077824, "step": 2430 }, { "epoch": 0.48720706300177574, "grad_norm": 0.27499833703041077, "learning_rate": 4.681505561019192e-05, "loss": 0.0479, "num_input_tokens_seen": 8094400, "step": 2435 }, { "epoch": 0.48820748818247756, "grad_norm": 0.9674980640411377, "learning_rate": 4.6802248852342e-05, "loss": 0.0393, "num_input_tokens_seen": 8110880, "step": 2440 }, { "epoch": 0.48920791336317937, "grad_norm": 0.9127982258796692, "learning_rate": 4.678941815693657e-05, "loss": 0.0543, "num_input_tokens_seen": 8127040, "step": 2445 }, { "epoch": 0.49020833854388113, "grad_norm": 0.6197338104248047, "learning_rate": 4.677656353806297e-05, "loss": 0.0293, "num_input_tokens_seen": 8143712, "step": 2450 }, { "epoch": 0.49120876372458294, "grad_norm": 0.7034648656845093, "learning_rate": 4.6763685009834784e-05, "loss": 0.0416, "num_input_tokens_seen": 8159424, "step": 2455 }, { "epoch": 0.49220918890528476, "grad_norm": 0.5745025277137756, "learning_rate": 4.675078258639187e-05, "loss": 0.0449, "num_input_tokens_seen": 8176768, "step": 2460 }, { "epoch": 0.49320961408598657, "grad_norm": 0.3975605368614197, "learning_rate": 4.6737856281900295e-05, "loss": 0.0441, "num_input_tokens_seen": 8193248, "step": 2465 }, { "epoch": 0.49421003926668833, "grad_norm": 0.2682710587978363, "learning_rate": 4.672490611055238e-05, "loss": 0.0274, "num_input_tokens_seen": 8209824, "step": 2470 }, { "epoch": 0.49521046444739014, "grad_norm": 0.23972515761852264, "learning_rate": 4.671193208656662e-05, "loss": 0.0211, "num_input_tokens_seen": 8227232, "step": 2475 }, { "epoch": 0.49621088962809196, "grad_norm": 0.36315664649009705, "learning_rate": 4.669893422418773e-05, "loss": 0.0613, "num_input_tokens_seen": 8243520, "step": 2480 }, { "epoch": 0.4972113148087937, "grad_norm": 0.44635969400405884, "learning_rate": 4.668591253768656e-05, "loss": 0.0396, "num_input_tokens_seen": 8260384, "step": 2485 }, { "epoch": 0.49821173998949553, "grad_norm": 0.4753713011741638, "learning_rate": 4.667286704136014e-05, "loss": 0.0224, "num_input_tokens_seen": 8276192, "step": 2490 }, { "epoch": 0.49921216517019734, "grad_norm": 0.20319467782974243, "learning_rate": 4.665979774953165e-05, "loss": 0.0461, "num_input_tokens_seen": 8292288, "step": 2495 }, { "epoch": 0.5002125903508992, "grad_norm": 0.5693745613098145, "learning_rate": 4.664670467655038e-05, "loss": 0.0336, "num_input_tokens_seen": 8308960, "step": 2500 }, { "epoch": 0.5012130155316009, "grad_norm": 0.30443742871284485, "learning_rate": 4.663358783679173e-05, "loss": 0.0371, "num_input_tokens_seen": 8327200, "step": 2505 }, { "epoch": 0.5022134407123028, "grad_norm": 0.6215730309486389, "learning_rate": 4.662044724465719e-05, "loss": 0.0523, "num_input_tokens_seen": 8344864, "step": 2510 }, { "epoch": 0.5032138658930045, "grad_norm": 0.48964619636535645, "learning_rate": 4.660728291457437e-05, "loss": 0.0298, "num_input_tokens_seen": 8361696, "step": 2515 }, { "epoch": 0.5042142910737063, "grad_norm": 0.5137848854064941, "learning_rate": 4.659409486099689e-05, "loss": 0.0186, "num_input_tokens_seen": 8378048, "step": 2520 }, { "epoch": 0.5052147162544082, "grad_norm": 0.3688870370388031, "learning_rate": 4.658088309840444e-05, "loss": 0.034, "num_input_tokens_seen": 8395104, "step": 2525 }, { "epoch": 0.5062151414351099, "grad_norm": 0.5594642162322998, "learning_rate": 4.656764764130274e-05, "loss": 0.0219, "num_input_tokens_seen": 8411360, "step": 2530 }, { "epoch": 0.5072155666158117, "grad_norm": 0.21476143598556519, "learning_rate": 4.655438850422352e-05, "loss": 0.0466, "num_input_tokens_seen": 8428480, "step": 2535 }, { "epoch": 0.5082159917965136, "grad_norm": 0.4187186360359192, "learning_rate": 4.654110570172452e-05, "loss": 0.0319, "num_input_tokens_seen": 8445312, "step": 2540 }, { "epoch": 0.5092164169772153, "grad_norm": 0.7518160939216614, "learning_rate": 4.652779924838945e-05, "loss": 0.0251, "num_input_tokens_seen": 8462528, "step": 2545 }, { "epoch": 0.5102168421579171, "grad_norm": 0.717582106590271, "learning_rate": 4.6514469158828014e-05, "loss": 0.0294, "num_input_tokens_seen": 8479360, "step": 2550 }, { "epoch": 0.5112172673386189, "grad_norm": 0.16009634733200073, "learning_rate": 4.6501115447675814e-05, "loss": 0.0367, "num_input_tokens_seen": 8496672, "step": 2555 }, { "epoch": 0.5122176925193207, "grad_norm": 0.5227007865905762, "learning_rate": 4.648773812959445e-05, "loss": 0.0696, "num_input_tokens_seen": 8514176, "step": 2560 }, { "epoch": 0.5132181177000225, "grad_norm": 0.6089304685592651, "learning_rate": 4.647433721927139e-05, "loss": 0.0266, "num_input_tokens_seen": 8529824, "step": 2565 }, { "epoch": 0.5142185428807243, "grad_norm": 0.5312008857727051, "learning_rate": 4.6460912731420045e-05, "loss": 0.0512, "num_input_tokens_seen": 8546912, "step": 2570 }, { "epoch": 0.5152189680614261, "grad_norm": 0.2808869779109955, "learning_rate": 4.644746468077968e-05, "loss": 0.0354, "num_input_tokens_seen": 8563264, "step": 2575 }, { "epoch": 0.516219393242128, "grad_norm": 0.32890409231185913, "learning_rate": 4.6433993082115444e-05, "loss": 0.0659, "num_input_tokens_seen": 8580608, "step": 2580 }, { "epoch": 0.5172198184228297, "grad_norm": 0.6253911852836609, "learning_rate": 4.642049795021836e-05, "loss": 0.0375, "num_input_tokens_seen": 8596736, "step": 2585 }, { "epoch": 0.5182202436035315, "grad_norm": 0.7056679129600525, "learning_rate": 4.6406979299905236e-05, "loss": 0.0472, "num_input_tokens_seen": 8613184, "step": 2590 }, { "epoch": 0.5192206687842333, "grad_norm": 0.6428877115249634, "learning_rate": 4.639343714601876e-05, "loss": 0.0387, "num_input_tokens_seen": 8629984, "step": 2595 }, { "epoch": 0.5202210939649351, "grad_norm": 0.7325150370597839, "learning_rate": 4.63798715034274e-05, "loss": 0.0409, "num_input_tokens_seen": 8646688, "step": 2600 }, { "epoch": 0.5212215191456369, "grad_norm": 1.1029064655303955, "learning_rate": 4.636628238702541e-05, "loss": 0.0601, "num_input_tokens_seen": 8662240, "step": 2605 }, { "epoch": 0.5222219443263387, "grad_norm": 0.7259523868560791, "learning_rate": 4.6352669811732805e-05, "loss": 0.0404, "num_input_tokens_seen": 8679488, "step": 2610 }, { "epoch": 0.5232223695070405, "grad_norm": 0.5248921513557434, "learning_rate": 4.63390337924954e-05, "loss": 0.0411, "num_input_tokens_seen": 8695744, "step": 2615 }, { "epoch": 0.5242227946877422, "grad_norm": 0.9337000846862793, "learning_rate": 4.6325374344284704e-05, "loss": 0.0512, "num_input_tokens_seen": 8712544, "step": 2620 }, { "epoch": 0.5252232198684441, "grad_norm": 0.2723657488822937, "learning_rate": 4.6311691482097985e-05, "loss": 0.0343, "num_input_tokens_seen": 8729504, "step": 2625 }, { "epoch": 0.5262236450491459, "grad_norm": 0.29864421486854553, "learning_rate": 4.629798522095818e-05, "loss": 0.0361, "num_input_tokens_seen": 8745696, "step": 2630 }, { "epoch": 0.5272240702298476, "grad_norm": 0.750019371509552, "learning_rate": 4.628425557591395e-05, "loss": 0.0341, "num_input_tokens_seen": 8762016, "step": 2635 }, { "epoch": 0.5282244954105495, "grad_norm": 0.2977132499217987, "learning_rate": 4.627050256203962e-05, "loss": 0.0425, "num_input_tokens_seen": 8778400, "step": 2640 }, { "epoch": 0.5292249205912513, "grad_norm": 0.9307669401168823, "learning_rate": 4.6256726194435164e-05, "loss": 0.0541, "num_input_tokens_seen": 8795616, "step": 2645 }, { "epoch": 0.5302253457719531, "grad_norm": 0.29842454195022583, "learning_rate": 4.62429264882262e-05, "loss": 0.033, "num_input_tokens_seen": 8811584, "step": 2650 }, { "epoch": 0.5312257709526549, "grad_norm": 0.880999743938446, "learning_rate": 4.622910345856399e-05, "loss": 0.034, "num_input_tokens_seen": 8828096, "step": 2655 }, { "epoch": 0.5322261961333566, "grad_norm": 0.32106754183769226, "learning_rate": 4.621525712062537e-05, "loss": 0.0411, "num_input_tokens_seen": 8845216, "step": 2660 }, { "epoch": 0.5332266213140585, "grad_norm": 0.3914763331413269, "learning_rate": 4.62013874896128e-05, "loss": 0.0383, "num_input_tokens_seen": 8861216, "step": 2665 }, { "epoch": 0.5342270464947603, "grad_norm": 0.38581305742263794, "learning_rate": 4.6187494580754284e-05, "loss": 0.0438, "num_input_tokens_seen": 8877696, "step": 2670 }, { "epoch": 0.535227471675462, "grad_norm": 0.8075183033943176, "learning_rate": 4.617357840930341e-05, "loss": 0.0371, "num_input_tokens_seen": 8894688, "step": 2675 }, { "epoch": 0.5362278968561639, "grad_norm": 0.766015350818634, "learning_rate": 4.615963899053929e-05, "loss": 0.0526, "num_input_tokens_seen": 8910144, "step": 2680 }, { "epoch": 0.5372283220368657, "grad_norm": 1.2620230913162231, "learning_rate": 4.614567633976656e-05, "loss": 0.0485, "num_input_tokens_seen": 8925632, "step": 2685 }, { "epoch": 0.5382287472175674, "grad_norm": 0.4963071942329407, "learning_rate": 4.6131690472315394e-05, "loss": 0.027, "num_input_tokens_seen": 8942688, "step": 2690 }, { "epoch": 0.5392291723982693, "grad_norm": 0.4999218285083771, "learning_rate": 4.6117681403541406e-05, "loss": 0.0288, "num_input_tokens_seen": 8959744, "step": 2695 }, { "epoch": 0.540229597578971, "grad_norm": 0.45712122321128845, "learning_rate": 4.610364914882572e-05, "loss": 0.0353, "num_input_tokens_seen": 8976256, "step": 2700 }, { "epoch": 0.5412300227596729, "grad_norm": 0.7429212331771851, "learning_rate": 4.60895937235749e-05, "loss": 0.0274, "num_input_tokens_seen": 8992288, "step": 2705 }, { "epoch": 0.5422304479403747, "grad_norm": 0.8552815914154053, "learning_rate": 4.607551514322096e-05, "loss": 0.0519, "num_input_tokens_seen": 9008960, "step": 2710 }, { "epoch": 0.5432308731210764, "grad_norm": 0.2171797752380371, "learning_rate": 4.606141342322134e-05, "loss": 0.0296, "num_input_tokens_seen": 9024800, "step": 2715 }, { "epoch": 0.5442312983017783, "grad_norm": 0.6828743815422058, "learning_rate": 4.6047288579058864e-05, "loss": 0.0401, "num_input_tokens_seen": 9041152, "step": 2720 }, { "epoch": 0.5452317234824801, "grad_norm": 0.501909613609314, "learning_rate": 4.6033140626241776e-05, "loss": 0.0597, "num_input_tokens_seen": 9059040, "step": 2725 }, { "epoch": 0.5462321486631818, "grad_norm": 0.2294810712337494, "learning_rate": 4.6018969580303664e-05, "loss": 0.0281, "num_input_tokens_seen": 9076256, "step": 2730 }, { "epoch": 0.5472325738438837, "grad_norm": 0.20600394904613495, "learning_rate": 4.600477545680348e-05, "loss": 0.0271, "num_input_tokens_seen": 9093792, "step": 2735 }, { "epoch": 0.5482329990245854, "grad_norm": 0.3571174144744873, "learning_rate": 4.599055827132553e-05, "loss": 0.0293, "num_input_tokens_seen": 9110016, "step": 2740 }, { "epoch": 0.5492334242052872, "grad_norm": 0.4264056980609894, "learning_rate": 4.597631803947942e-05, "loss": 0.0377, "num_input_tokens_seen": 9127648, "step": 2745 }, { "epoch": 0.5502338493859891, "grad_norm": 0.6972314119338989, "learning_rate": 4.5962054776900055e-05, "loss": 0.0247, "num_input_tokens_seen": 9143840, "step": 2750 }, { "epoch": 0.5512342745666908, "grad_norm": 1.5259716510772705, "learning_rate": 4.5947768499247656e-05, "loss": 0.0358, "num_input_tokens_seen": 9159648, "step": 2755 }, { "epoch": 0.5522346997473926, "grad_norm": 0.6529611349105835, "learning_rate": 4.5933459222207694e-05, "loss": 0.0183, "num_input_tokens_seen": 9177728, "step": 2760 }, { "epoch": 0.5532351249280945, "grad_norm": 1.142457365989685, "learning_rate": 4.591912696149088e-05, "loss": 0.0493, "num_input_tokens_seen": 9193536, "step": 2765 }, { "epoch": 0.5542355501087962, "grad_norm": 0.47713854908943176, "learning_rate": 4.5904771732833176e-05, "loss": 0.0504, "num_input_tokens_seen": 9209920, "step": 2770 }, { "epoch": 0.5552359752894981, "grad_norm": 0.8752053380012512, "learning_rate": 4.5890393551995764e-05, "loss": 0.0376, "num_input_tokens_seen": 9225216, "step": 2775 }, { "epoch": 0.5562364004701998, "grad_norm": 0.5204348564147949, "learning_rate": 4.5875992434765014e-05, "loss": 0.0512, "num_input_tokens_seen": 9241440, "step": 2780 }, { "epoch": 0.5572368256509016, "grad_norm": 0.43067991733551025, "learning_rate": 4.586156839695249e-05, "loss": 0.0286, "num_input_tokens_seen": 9257984, "step": 2785 }, { "epoch": 0.5582372508316035, "grad_norm": 0.9396999478340149, "learning_rate": 4.5847121454394925e-05, "loss": 0.0404, "num_input_tokens_seen": 9275264, "step": 2790 }, { "epoch": 0.5592376760123052, "grad_norm": 0.22550955414772034, "learning_rate": 4.583265162295417e-05, "loss": 0.0528, "num_input_tokens_seen": 9292608, "step": 2795 }, { "epoch": 0.560238101193007, "grad_norm": 1.111317753791809, "learning_rate": 4.581815891851724e-05, "loss": 0.0521, "num_input_tokens_seen": 9308640, "step": 2800 }, { "epoch": 0.5612385263737089, "grad_norm": 0.24286577105522156, "learning_rate": 4.580364335699626e-05, "loss": 0.0315, "num_input_tokens_seen": 9326048, "step": 2805 }, { "epoch": 0.5622389515544106, "grad_norm": 0.6611106991767883, "learning_rate": 4.578910495432842e-05, "loss": 0.0456, "num_input_tokens_seen": 9341984, "step": 2810 }, { "epoch": 0.5632393767351124, "grad_norm": 0.38972610235214233, "learning_rate": 4.5774543726476026e-05, "loss": 0.0256, "num_input_tokens_seen": 9358016, "step": 2815 }, { "epoch": 0.5642398019158142, "grad_norm": 0.46140381693840027, "learning_rate": 4.5759959689426425e-05, "loss": 0.0279, "num_input_tokens_seen": 9374880, "step": 2820 }, { "epoch": 0.565240227096516, "grad_norm": 0.8493245244026184, "learning_rate": 4.5745352859192017e-05, "loss": 0.0323, "num_input_tokens_seen": 9391840, "step": 2825 }, { "epoch": 0.5662406522772179, "grad_norm": 0.1267855316400528, "learning_rate": 4.573072325181021e-05, "loss": 0.0227, "num_input_tokens_seen": 9407360, "step": 2830 }, { "epoch": 0.5672410774579196, "grad_norm": 0.7152971029281616, "learning_rate": 4.571607088334344e-05, "loss": 0.041, "num_input_tokens_seen": 9423200, "step": 2835 }, { "epoch": 0.5682415026386214, "grad_norm": 0.5422306060791016, "learning_rate": 4.570139576987912e-05, "loss": 0.045, "num_input_tokens_seen": 9438976, "step": 2840 }, { "epoch": 0.5692419278193233, "grad_norm": 0.2335476279258728, "learning_rate": 4.568669792752964e-05, "loss": 0.0368, "num_input_tokens_seen": 9455328, "step": 2845 }, { "epoch": 0.570242353000025, "grad_norm": 0.24949710071086884, "learning_rate": 4.567197737243235e-05, "loss": 0.0313, "num_input_tokens_seen": 9471520, "step": 2850 }, { "epoch": 0.5712427781807268, "grad_norm": 0.38113167881965637, "learning_rate": 4.565723412074953e-05, "loss": 0.0454, "num_input_tokens_seen": 9487904, "step": 2855 }, { "epoch": 0.5722432033614286, "grad_norm": 1.0451862812042236, "learning_rate": 4.564246818866838e-05, "loss": 0.0441, "num_input_tokens_seen": 9503872, "step": 2860 }, { "epoch": 0.5732436285421304, "grad_norm": 0.9802286028862, "learning_rate": 4.5627679592401e-05, "loss": 0.0406, "num_input_tokens_seen": 9519936, "step": 2865 }, { "epoch": 0.5742440537228322, "grad_norm": 0.46139222383499146, "learning_rate": 4.561286834818439e-05, "loss": 0.0278, "num_input_tokens_seen": 9536544, "step": 2870 }, { "epoch": 0.575244478903534, "grad_norm": 0.85197913646698, "learning_rate": 4.559803447228039e-05, "loss": 0.0343, "num_input_tokens_seen": 9552320, "step": 2875 }, { "epoch": 0.5762449040842358, "grad_norm": 0.7129600048065186, "learning_rate": 4.558317798097571e-05, "loss": 0.0294, "num_input_tokens_seen": 9568192, "step": 2880 }, { "epoch": 0.5772453292649375, "grad_norm": 0.34167492389678955, "learning_rate": 4.556829889058188e-05, "loss": 0.0323, "num_input_tokens_seen": 9584640, "step": 2885 }, { "epoch": 0.5782457544456394, "grad_norm": 0.6730559468269348, "learning_rate": 4.555339721743526e-05, "loss": 0.0478, "num_input_tokens_seen": 9602656, "step": 2890 }, { "epoch": 0.5792461796263412, "grad_norm": 0.5515514612197876, "learning_rate": 4.553847297789696e-05, "loss": 0.0181, "num_input_tokens_seen": 9620384, "step": 2895 }, { "epoch": 0.580246604807043, "grad_norm": 0.8256182670593262, "learning_rate": 4.552352618835291e-05, "loss": 0.0416, "num_input_tokens_seen": 9636640, "step": 2900 }, { "epoch": 0.5812470299877448, "grad_norm": 0.439656525850296, "learning_rate": 4.5508556865213784e-05, "loss": 0.0344, "num_input_tokens_seen": 9653248, "step": 2905 }, { "epoch": 0.5822474551684466, "grad_norm": 0.7512446641921997, "learning_rate": 4.5493565024915005e-05, "loss": 0.0607, "num_input_tokens_seen": 9670432, "step": 2910 }, { "epoch": 0.5832478803491484, "grad_norm": 0.5968084335327148, "learning_rate": 4.54785506839167e-05, "loss": 0.0401, "num_input_tokens_seen": 9685632, "step": 2915 }, { "epoch": 0.5842483055298502, "grad_norm": 0.41218385100364685, "learning_rate": 4.546351385870371e-05, "loss": 0.0838, "num_input_tokens_seen": 9702272, "step": 2920 }, { "epoch": 0.585248730710552, "grad_norm": 0.23477399349212646, "learning_rate": 4.544845456578557e-05, "loss": 0.0402, "num_input_tokens_seen": 9718432, "step": 2925 }, { "epoch": 0.5862491558912538, "grad_norm": 0.34075719118118286, "learning_rate": 4.5433372821696465e-05, "loss": 0.0376, "num_input_tokens_seen": 9735392, "step": 2930 }, { "epoch": 0.5872495810719556, "grad_norm": 0.4260369539260864, "learning_rate": 4.541826864299526e-05, "loss": 0.0264, "num_input_tokens_seen": 9751712, "step": 2935 }, { "epoch": 0.5882500062526573, "grad_norm": 0.2027159035205841, "learning_rate": 4.5403142046265415e-05, "loss": 0.0318, "num_input_tokens_seen": 9768864, "step": 2940 }, { "epoch": 0.5892504314333592, "grad_norm": 0.6176860332489014, "learning_rate": 4.538799304811503e-05, "loss": 0.0337, "num_input_tokens_seen": 9784768, "step": 2945 }, { "epoch": 0.590250856614061, "grad_norm": 1.2780542373657227, "learning_rate": 4.537282166517679e-05, "loss": 0.0409, "num_input_tokens_seen": 9801888, "step": 2950 }, { "epoch": 0.5912512817947628, "grad_norm": 1.051655650138855, "learning_rate": 4.5357627914107956e-05, "loss": 0.0387, "num_input_tokens_seen": 9818048, "step": 2955 }, { "epoch": 0.5922517069754646, "grad_norm": 0.1395687311887741, "learning_rate": 4.534241181159037e-05, "loss": 0.0465, "num_input_tokens_seen": 9834208, "step": 2960 }, { "epoch": 0.5932521321561663, "grad_norm": 0.737740159034729, "learning_rate": 4.532717337433037e-05, "loss": 0.0304, "num_input_tokens_seen": 9850848, "step": 2965 }, { "epoch": 0.5942525573368682, "grad_norm": 0.392226904630661, "learning_rate": 4.531191261905885e-05, "loss": 0.0542, "num_input_tokens_seen": 9866432, "step": 2970 }, { "epoch": 0.59525298251757, "grad_norm": 0.33025363087654114, "learning_rate": 4.5296629562531204e-05, "loss": 0.0281, "num_input_tokens_seen": 9883872, "step": 2975 }, { "epoch": 0.5962534076982717, "grad_norm": 0.18435664474964142, "learning_rate": 4.5281324221527314e-05, "loss": 0.0326, "num_input_tokens_seen": 9900512, "step": 2980 }, { "epoch": 0.5972538328789736, "grad_norm": 0.26673799753189087, "learning_rate": 4.5265996612851504e-05, "loss": 0.0255, "num_input_tokens_seen": 9917600, "step": 2985 }, { "epoch": 0.5982542580596754, "grad_norm": 0.2888360917568207, "learning_rate": 4.525064675333258e-05, "loss": 0.0256, "num_input_tokens_seen": 9933728, "step": 2990 }, { "epoch": 0.5992546832403771, "grad_norm": 0.37292373180389404, "learning_rate": 4.523527465982375e-05, "loss": 0.0277, "num_input_tokens_seen": 9950688, "step": 2995 }, { "epoch": 0.600255108421079, "grad_norm": 0.5753964185714722, "learning_rate": 4.521988034920266e-05, "loss": 0.0279, "num_input_tokens_seen": 9967072, "step": 3000 }, { "epoch": 0.6012555336017807, "grad_norm": 0.7338746190071106, "learning_rate": 4.5204463838371333e-05, "loss": 0.0381, "num_input_tokens_seen": 9983424, "step": 3005 }, { "epoch": 0.6022559587824825, "grad_norm": 0.1053917333483696, "learning_rate": 4.518902514425616e-05, "loss": 0.0168, "num_input_tokens_seen": 10000608, "step": 3010 }, { "epoch": 0.6032563839631844, "grad_norm": 0.3549809455871582, "learning_rate": 4.51735642838079e-05, "loss": 0.0602, "num_input_tokens_seen": 10017376, "step": 3015 }, { "epoch": 0.6042568091438861, "grad_norm": 0.2778543531894684, "learning_rate": 4.515808127400165e-05, "loss": 0.0358, "num_input_tokens_seen": 10033504, "step": 3020 }, { "epoch": 0.605257234324588, "grad_norm": 0.5423609614372253, "learning_rate": 4.514257613183682e-05, "loss": 0.0649, "num_input_tokens_seen": 10050816, "step": 3025 }, { "epoch": 0.6062576595052898, "grad_norm": 0.5223437547683716, "learning_rate": 4.5127048874337123e-05, "loss": 0.0475, "num_input_tokens_seen": 10066080, "step": 3030 }, { "epoch": 0.6072580846859915, "grad_norm": 0.2898062765598297, "learning_rate": 4.511149951855055e-05, "loss": 0.0546, "num_input_tokens_seen": 10083200, "step": 3035 }, { "epoch": 0.6082585098666934, "grad_norm": 0.3376716077327728, "learning_rate": 4.509592808154936e-05, "loss": 0.0263, "num_input_tokens_seen": 10099616, "step": 3040 }, { "epoch": 0.6092589350473951, "grad_norm": 0.6940332651138306, "learning_rate": 4.508033458043005e-05, "loss": 0.0412, "num_input_tokens_seen": 10115744, "step": 3045 }, { "epoch": 0.6102593602280969, "grad_norm": 0.4612104892730713, "learning_rate": 4.506471903231334e-05, "loss": 0.0272, "num_input_tokens_seen": 10132224, "step": 3050 }, { "epoch": 0.6112597854087988, "grad_norm": 0.33771753311157227, "learning_rate": 4.5049081454344173e-05, "loss": 0.039, "num_input_tokens_seen": 10151008, "step": 3055 }, { "epoch": 0.6122602105895005, "grad_norm": 0.5704123377799988, "learning_rate": 4.503342186369165e-05, "loss": 0.036, "num_input_tokens_seen": 10168096, "step": 3060 }, { "epoch": 0.6132606357702023, "grad_norm": 0.1791711002588272, "learning_rate": 4.501774027754908e-05, "loss": 0.0151, "num_input_tokens_seen": 10183616, "step": 3065 }, { "epoch": 0.6142610609509042, "grad_norm": 0.5915095806121826, "learning_rate": 4.5002036713133876e-05, "loss": 0.0321, "num_input_tokens_seen": 10200288, "step": 3070 }, { "epoch": 0.6152614861316059, "grad_norm": 0.5338975787162781, "learning_rate": 4.4986311187687624e-05, "loss": 0.0217, "num_input_tokens_seen": 10217248, "step": 3075 }, { "epoch": 0.6162619113123077, "grad_norm": 1.0453840494155884, "learning_rate": 4.4970563718476e-05, "loss": 0.0589, "num_input_tokens_seen": 10234688, "step": 3080 }, { "epoch": 0.6172623364930095, "grad_norm": 0.9612706899642944, "learning_rate": 4.495479432278876e-05, "loss": 0.0498, "num_input_tokens_seen": 10252096, "step": 3085 }, { "epoch": 0.6182627616737113, "grad_norm": 0.31238457560539246, "learning_rate": 4.493900301793977e-05, "loss": 0.0497, "num_input_tokens_seen": 10268512, "step": 3090 }, { "epoch": 0.6192631868544132, "grad_norm": 0.5907064080238342, "learning_rate": 4.492318982126693e-05, "loss": 0.0504, "num_input_tokens_seen": 10284736, "step": 3095 }, { "epoch": 0.6202636120351149, "grad_norm": 0.2940639853477478, "learning_rate": 4.490735475013217e-05, "loss": 0.0501, "num_input_tokens_seen": 10301632, "step": 3100 }, { "epoch": 0.6212640372158167, "grad_norm": 0.20434249937534332, "learning_rate": 4.4891497821921436e-05, "loss": 0.0248, "num_input_tokens_seen": 10319680, "step": 3105 }, { "epoch": 0.6222644623965186, "grad_norm": 0.44756656885147095, "learning_rate": 4.487561905404469e-05, "loss": 0.0591, "num_input_tokens_seen": 10336960, "step": 3110 }, { "epoch": 0.6232648875772203, "grad_norm": 0.3095456659793854, "learning_rate": 4.485971846393587e-05, "loss": 0.0334, "num_input_tokens_seen": 10352928, "step": 3115 }, { "epoch": 0.6242653127579221, "grad_norm": 0.6086257100105286, "learning_rate": 4.484379606905286e-05, "loss": 0.031, "num_input_tokens_seen": 10369632, "step": 3120 }, { "epoch": 0.6252657379386239, "grad_norm": 0.8644992709159851, "learning_rate": 4.482785188687747e-05, "loss": 0.0522, "num_input_tokens_seen": 10384992, "step": 3125 }, { "epoch": 0.6262661631193257, "grad_norm": 0.22823557257652283, "learning_rate": 4.4811885934915485e-05, "loss": 0.0193, "num_input_tokens_seen": 10401536, "step": 3130 }, { "epoch": 0.6272665883000275, "grad_norm": 0.35614490509033203, "learning_rate": 4.479589823069653e-05, "loss": 0.0364, "num_input_tokens_seen": 10418144, "step": 3135 }, { "epoch": 0.6282670134807293, "grad_norm": 0.395463228225708, "learning_rate": 4.477988879177416e-05, "loss": 0.0365, "num_input_tokens_seen": 10434176, "step": 3140 }, { "epoch": 0.6292674386614311, "grad_norm": 0.5990577936172485, "learning_rate": 4.476385763572576e-05, "loss": 0.0362, "num_input_tokens_seen": 10450432, "step": 3145 }, { "epoch": 0.630267863842133, "grad_norm": 0.7763248085975647, "learning_rate": 4.474780478015259e-05, "loss": 0.0413, "num_input_tokens_seen": 10466464, "step": 3150 }, { "epoch": 0.6312682890228347, "grad_norm": 0.3586103618144989, "learning_rate": 4.47317302426797e-05, "loss": 0.0341, "num_input_tokens_seen": 10483200, "step": 3155 }, { "epoch": 0.6322687142035365, "grad_norm": 0.2397426962852478, "learning_rate": 4.4715634040955977e-05, "loss": 0.0341, "num_input_tokens_seen": 10499008, "step": 3160 }, { "epoch": 0.6332691393842383, "grad_norm": 0.14368750154972076, "learning_rate": 4.469951619265408e-05, "loss": 0.0343, "num_input_tokens_seen": 10516928, "step": 3165 }, { "epoch": 0.6342695645649401, "grad_norm": 0.5184279680252075, "learning_rate": 4.468337671547042e-05, "loss": 0.0322, "num_input_tokens_seen": 10532800, "step": 3170 }, { "epoch": 0.6352699897456419, "grad_norm": 0.610159158706665, "learning_rate": 4.466721562712518e-05, "loss": 0.0294, "num_input_tokens_seen": 10550112, "step": 3175 }, { "epoch": 0.6362704149263437, "grad_norm": 0.4582042396068573, "learning_rate": 4.465103294536227e-05, "loss": 0.0614, "num_input_tokens_seen": 10565376, "step": 3180 }, { "epoch": 0.6372708401070455, "grad_norm": 0.4239751696586609, "learning_rate": 4.4634828687949296e-05, "loss": 0.0217, "num_input_tokens_seen": 10582112, "step": 3185 }, { "epoch": 0.6382712652877472, "grad_norm": 0.6495357155799866, "learning_rate": 4.461860287267754e-05, "loss": 0.0331, "num_input_tokens_seen": 10598592, "step": 3190 }, { "epoch": 0.6392716904684491, "grad_norm": 0.39509135484695435, "learning_rate": 4.460235551736199e-05, "loss": 0.0351, "num_input_tokens_seen": 10615392, "step": 3195 }, { "epoch": 0.6402721156491509, "grad_norm": 0.40606188774108887, "learning_rate": 4.458608663984125e-05, "loss": 0.0383, "num_input_tokens_seen": 10632352, "step": 3200 }, { "epoch": 0.6412725408298526, "grad_norm": 0.1430123746395111, "learning_rate": 4.4569796257977576e-05, "loss": 0.013, "num_input_tokens_seen": 10649504, "step": 3205 }, { "epoch": 0.6422729660105545, "grad_norm": 0.1357680708169937, "learning_rate": 4.455348438965682e-05, "loss": 0.025, "num_input_tokens_seen": 10667008, "step": 3210 }, { "epoch": 0.6432733911912563, "grad_norm": 0.4743283689022064, "learning_rate": 4.4537151052788425e-05, "loss": 0.0404, "num_input_tokens_seen": 10683744, "step": 3215 }, { "epoch": 0.6442738163719581, "grad_norm": 1.1936310529708862, "learning_rate": 4.452079626530543e-05, "loss": 0.0565, "num_input_tokens_seen": 10699840, "step": 3220 }, { "epoch": 0.6452742415526599, "grad_norm": 0.24838638305664062, "learning_rate": 4.450442004516439e-05, "loss": 0.0325, "num_input_tokens_seen": 10716416, "step": 3225 }, { "epoch": 0.6462746667333616, "grad_norm": 0.38544249534606934, "learning_rate": 4.44880224103454e-05, "loss": 0.0364, "num_input_tokens_seen": 10732960, "step": 3230 }, { "epoch": 0.6472750919140635, "grad_norm": 0.5896895527839661, "learning_rate": 4.44716033788521e-05, "loss": 0.0392, "num_input_tokens_seen": 10749632, "step": 3235 }, { "epoch": 0.6482755170947653, "grad_norm": 0.49441149830818176, "learning_rate": 4.4455162968711585e-05, "loss": 0.019, "num_input_tokens_seen": 10767808, "step": 3240 }, { "epoch": 0.649275942275467, "grad_norm": 0.14837044477462769, "learning_rate": 4.4438701197974424e-05, "loss": 0.0485, "num_input_tokens_seen": 10784512, "step": 3245 }, { "epoch": 0.6502763674561689, "grad_norm": 0.5606879591941833, "learning_rate": 4.4422218084714664e-05, "loss": 0.0354, "num_input_tokens_seen": 10800896, "step": 3250 }, { "epoch": 0.6512767926368707, "grad_norm": 0.6518362164497375, "learning_rate": 4.440571364702977e-05, "loss": 0.0263, "num_input_tokens_seen": 10817568, "step": 3255 }, { "epoch": 0.6522772178175724, "grad_norm": 0.6149171590805054, "learning_rate": 4.4389187903040605e-05, "loss": 0.0726, "num_input_tokens_seen": 10833824, "step": 3260 }, { "epoch": 0.6532776429982743, "grad_norm": 0.4839974343776703, "learning_rate": 4.437264087089146e-05, "loss": 0.0421, "num_input_tokens_seen": 10850432, "step": 3265 }, { "epoch": 0.654278068178976, "grad_norm": 0.5093735456466675, "learning_rate": 4.435607256874996e-05, "loss": 0.0282, "num_input_tokens_seen": 10867296, "step": 3270 }, { "epoch": 0.6552784933596779, "grad_norm": 0.5983829498291016, "learning_rate": 4.433948301480712e-05, "loss": 0.0304, "num_input_tokens_seen": 10883136, "step": 3275 }, { "epoch": 0.6562789185403797, "grad_norm": 0.37272733449935913, "learning_rate": 4.4322872227277255e-05, "loss": 0.0365, "num_input_tokens_seen": 10899904, "step": 3280 }, { "epoch": 0.6572793437210814, "grad_norm": 0.5240197777748108, "learning_rate": 4.4306240224398024e-05, "loss": 0.0304, "num_input_tokens_seen": 10915808, "step": 3285 }, { "epoch": 0.6582797689017833, "grad_norm": 0.26496824622154236, "learning_rate": 4.428958702443035e-05, "loss": 0.0191, "num_input_tokens_seen": 10932608, "step": 3290 }, { "epoch": 0.6592801940824851, "grad_norm": 0.16277429461479187, "learning_rate": 4.4272912645658454e-05, "loss": 0.0191, "num_input_tokens_seen": 10949152, "step": 3295 }, { "epoch": 0.6602806192631868, "grad_norm": 0.9645645618438721, "learning_rate": 4.4256217106389796e-05, "loss": 0.0397, "num_input_tokens_seen": 10965792, "step": 3300 }, { "epoch": 0.6612810444438887, "grad_norm": 0.878473699092865, "learning_rate": 4.423950042495506e-05, "loss": 0.0409, "num_input_tokens_seen": 10981184, "step": 3305 }, { "epoch": 0.6622814696245904, "grad_norm": 0.39659085869789124, "learning_rate": 4.422276261970818e-05, "loss": 0.0518, "num_input_tokens_seen": 10997088, "step": 3310 }, { "epoch": 0.6632818948052922, "grad_norm": 0.41067275404930115, "learning_rate": 4.420600370902623e-05, "loss": 0.0386, "num_input_tokens_seen": 11014784, "step": 3315 }, { "epoch": 0.6642823199859941, "grad_norm": 0.33324503898620605, "learning_rate": 4.4189223711309514e-05, "loss": 0.0388, "num_input_tokens_seen": 11030464, "step": 3320 }, { "epoch": 0.6652827451666958, "grad_norm": 0.4756176769733429, "learning_rate": 4.4172422644981435e-05, "loss": 0.0365, "num_input_tokens_seen": 11046464, "step": 3325 }, { "epoch": 0.6662831703473976, "grad_norm": 0.32805490493774414, "learning_rate": 4.415560052848855e-05, "loss": 0.0431, "num_input_tokens_seen": 11063744, "step": 3330 }, { "epoch": 0.6672835955280995, "grad_norm": 0.776425838470459, "learning_rate": 4.413875738030054e-05, "loss": 0.0389, "num_input_tokens_seen": 11080832, "step": 3335 }, { "epoch": 0.6682840207088012, "grad_norm": 0.3049832284450531, "learning_rate": 4.412189321891017e-05, "loss": 0.0338, "num_input_tokens_seen": 11098624, "step": 3340 }, { "epoch": 0.6692844458895031, "grad_norm": 0.5459462404251099, "learning_rate": 4.4105008062833264e-05, "loss": 0.0426, "num_input_tokens_seen": 11116064, "step": 3345 }, { "epoch": 0.6702848710702048, "grad_norm": 0.6240223050117493, "learning_rate": 4.408810193060871e-05, "loss": 0.0471, "num_input_tokens_seen": 11134208, "step": 3350 }, { "epoch": 0.6712852962509066, "grad_norm": 0.5435937643051147, "learning_rate": 4.407117484079843e-05, "loss": 0.0446, "num_input_tokens_seen": 11151168, "step": 3355 }, { "epoch": 0.6722857214316085, "grad_norm": 0.27000948786735535, "learning_rate": 4.405422681198732e-05, "loss": 0.0225, "num_input_tokens_seen": 11167296, "step": 3360 }, { "epoch": 0.6732861466123102, "grad_norm": 0.4941251873970032, "learning_rate": 4.403725786278333e-05, "loss": 0.0291, "num_input_tokens_seen": 11184128, "step": 3365 }, { "epoch": 0.674286571793012, "grad_norm": 0.551177442073822, "learning_rate": 4.4020268011817324e-05, "loss": 0.0207, "num_input_tokens_seen": 11201888, "step": 3370 }, { "epoch": 0.6752869969737139, "grad_norm": 0.43434232473373413, "learning_rate": 4.4003257277743135e-05, "loss": 0.0326, "num_input_tokens_seen": 11218912, "step": 3375 }, { "epoch": 0.6762874221544156, "grad_norm": 0.8790416717529297, "learning_rate": 4.3986225679237534e-05, "loss": 0.0464, "num_input_tokens_seen": 11234464, "step": 3380 }, { "epoch": 0.6772878473351174, "grad_norm": 0.6582038402557373, "learning_rate": 4.396917323500018e-05, "loss": 0.0226, "num_input_tokens_seen": 11251136, "step": 3385 }, { "epoch": 0.6782882725158192, "grad_norm": 0.34015730023384094, "learning_rate": 4.395209996375363e-05, "loss": 0.0342, "num_input_tokens_seen": 11268000, "step": 3390 }, { "epoch": 0.679288697696521, "grad_norm": 0.15952935814857483, "learning_rate": 4.393500588424333e-05, "loss": 0.0225, "num_input_tokens_seen": 11284256, "step": 3395 }, { "epoch": 0.6802891228772229, "grad_norm": 0.21244794130325317, "learning_rate": 4.391789101523751e-05, "loss": 0.0266, "num_input_tokens_seen": 11300896, "step": 3400 }, { "epoch": 0.6812895480579246, "grad_norm": 0.2520093023777008, "learning_rate": 4.390075537552729e-05, "loss": 0.0325, "num_input_tokens_seen": 11316768, "step": 3405 }, { "epoch": 0.6822899732386264, "grad_norm": 0.6954443454742432, "learning_rate": 4.388359898392656e-05, "loss": 0.074, "num_input_tokens_seen": 11333312, "step": 3410 }, { "epoch": 0.6832903984193283, "grad_norm": 0.2341863512992859, "learning_rate": 4.386642185927201e-05, "loss": 0.0247, "num_input_tokens_seen": 11350144, "step": 3415 }, { "epoch": 0.68429082360003, "grad_norm": 0.4664306044578552, "learning_rate": 4.3849224020423095e-05, "loss": 0.0323, "num_input_tokens_seen": 11365952, "step": 3420 }, { "epoch": 0.6852912487807318, "grad_norm": 0.22717294096946716, "learning_rate": 4.383200548626199e-05, "loss": 0.0527, "num_input_tokens_seen": 11382944, "step": 3425 }, { "epoch": 0.6862916739614336, "grad_norm": 0.8043995499610901, "learning_rate": 4.381476627569361e-05, "loss": 0.0347, "num_input_tokens_seen": 11398720, "step": 3430 }, { "epoch": 0.6872920991421354, "grad_norm": 0.6292429566383362, "learning_rate": 4.379750640764558e-05, "loss": 0.037, "num_input_tokens_seen": 11415168, "step": 3435 }, { "epoch": 0.6882925243228372, "grad_norm": 0.342271625995636, "learning_rate": 4.3780225901068194e-05, "loss": 0.0304, "num_input_tokens_seen": 11430912, "step": 3440 }, { "epoch": 0.689292949503539, "grad_norm": 0.6785433292388916, "learning_rate": 4.37629247749344e-05, "loss": 0.041, "num_input_tokens_seen": 11447904, "step": 3445 }, { "epoch": 0.6902933746842408, "grad_norm": 0.21225985884666443, "learning_rate": 4.374560304823979e-05, "loss": 0.0374, "num_input_tokens_seen": 11465152, "step": 3450 }, { "epoch": 0.6912937998649425, "grad_norm": 0.32921814918518066, "learning_rate": 4.372826074000258e-05, "loss": 0.0365, "num_input_tokens_seen": 11481760, "step": 3455 }, { "epoch": 0.6922942250456444, "grad_norm": 0.07804366946220398, "learning_rate": 4.371089786926359e-05, "loss": 0.0168, "num_input_tokens_seen": 11497888, "step": 3460 }, { "epoch": 0.6932946502263462, "grad_norm": 0.16711276769638062, "learning_rate": 4.369351445508618e-05, "loss": 0.0356, "num_input_tokens_seen": 11514592, "step": 3465 }, { "epoch": 0.694295075407048, "grad_norm": 0.49618369340896606, "learning_rate": 4.367611051655632e-05, "loss": 0.0502, "num_input_tokens_seen": 11532352, "step": 3470 }, { "epoch": 0.6952955005877498, "grad_norm": 0.2727205157279968, "learning_rate": 4.3658686072782476e-05, "loss": 0.0588, "num_input_tokens_seen": 11549408, "step": 3475 }, { "epoch": 0.6962959257684516, "grad_norm": 0.38875845074653625, "learning_rate": 4.364124114289562e-05, "loss": 0.0368, "num_input_tokens_seen": 11566912, "step": 3480 }, { "epoch": 0.6972963509491534, "grad_norm": 0.2872340679168701, "learning_rate": 4.362377574604926e-05, "loss": 0.0291, "num_input_tokens_seen": 11584960, "step": 3485 }, { "epoch": 0.6982967761298552, "grad_norm": 0.41954946517944336, "learning_rate": 4.3606289901419326e-05, "loss": 0.0362, "num_input_tokens_seen": 11601312, "step": 3490 }, { "epoch": 0.699297201310557, "grad_norm": 0.5533486604690552, "learning_rate": 4.358878362820425e-05, "loss": 0.0394, "num_input_tokens_seen": 11617440, "step": 3495 }, { "epoch": 0.7002976264912588, "grad_norm": 0.19497890770435333, "learning_rate": 4.357125694562484e-05, "loss": 0.0379, "num_input_tokens_seen": 11633888, "step": 3500 }, { "epoch": 0.7012980516719606, "grad_norm": 0.34632545709609985, "learning_rate": 4.355370987292434e-05, "loss": 0.0198, "num_input_tokens_seen": 11651136, "step": 3505 }, { "epoch": 0.7022984768526623, "grad_norm": 0.6666107773780823, "learning_rate": 4.353614242936839e-05, "loss": 0.0492, "num_input_tokens_seen": 11666016, "step": 3510 }, { "epoch": 0.7032989020333642, "grad_norm": 0.43019816279411316, "learning_rate": 4.351855463424498e-05, "loss": 0.031, "num_input_tokens_seen": 11682688, "step": 3515 }, { "epoch": 0.704299327214066, "grad_norm": 1.1388988494873047, "learning_rate": 4.350094650686445e-05, "loss": 0.0467, "num_input_tokens_seen": 11698688, "step": 3520 }, { "epoch": 0.7052997523947678, "grad_norm": 0.2856043875217438, "learning_rate": 4.3483318066559456e-05, "loss": 0.0315, "num_input_tokens_seen": 11716416, "step": 3525 }, { "epoch": 0.7063001775754696, "grad_norm": 0.1407749503850937, "learning_rate": 4.3465669332684965e-05, "loss": 0.0342, "num_input_tokens_seen": 11732960, "step": 3530 }, { "epoch": 0.7073006027561713, "grad_norm": 0.32876652479171753, "learning_rate": 4.344800032461823e-05, "loss": 0.0248, "num_input_tokens_seen": 11748480, "step": 3535 }, { "epoch": 0.7083010279368732, "grad_norm": 0.5137848258018494, "learning_rate": 4.343031106175876e-05, "loss": 0.0258, "num_input_tokens_seen": 11766048, "step": 3540 }, { "epoch": 0.709301453117575, "grad_norm": 0.499584436416626, "learning_rate": 4.341260156352828e-05, "loss": 0.0293, "num_input_tokens_seen": 11783232, "step": 3545 }, { "epoch": 0.7103018782982767, "grad_norm": 0.5204300284385681, "learning_rate": 4.339487184937078e-05, "loss": 0.0415, "num_input_tokens_seen": 11798976, "step": 3550 }, { "epoch": 0.7113023034789786, "grad_norm": 1.127689242362976, "learning_rate": 4.337712193875239e-05, "loss": 0.053, "num_input_tokens_seen": 11815680, "step": 3555 }, { "epoch": 0.7123027286596804, "grad_norm": 0.36804962158203125, "learning_rate": 4.335935185116147e-05, "loss": 0.0303, "num_input_tokens_seen": 11832864, "step": 3560 }, { "epoch": 0.7133031538403821, "grad_norm": 0.9016924500465393, "learning_rate": 4.334156160610847e-05, "loss": 0.0586, "num_input_tokens_seen": 11850272, "step": 3565 }, { "epoch": 0.714303579021084, "grad_norm": 0.5556852221488953, "learning_rate": 4.332375122312605e-05, "loss": 0.0301, "num_input_tokens_seen": 11866624, "step": 3570 }, { "epoch": 0.7153040042017857, "grad_norm": 0.5024580955505371, "learning_rate": 4.33059207217689e-05, "loss": 0.0317, "num_input_tokens_seen": 11882752, "step": 3575 }, { "epoch": 0.7163044293824875, "grad_norm": 0.4733322858810425, "learning_rate": 4.3288070121613854e-05, "loss": 0.0332, "num_input_tokens_seen": 11898720, "step": 3580 }, { "epoch": 0.7173048545631894, "grad_norm": 0.7094988822937012, "learning_rate": 4.32701994422598e-05, "loss": 0.0323, "num_input_tokens_seen": 11915264, "step": 3585 }, { "epoch": 0.7183052797438911, "grad_norm": 0.4604605436325073, "learning_rate": 4.3252308703327656e-05, "loss": 0.0415, "num_input_tokens_seen": 11932544, "step": 3590 }, { "epoch": 0.719305704924593, "grad_norm": 0.4017398953437805, "learning_rate": 4.323439792446038e-05, "loss": 0.0361, "num_input_tokens_seen": 11949856, "step": 3595 }, { "epoch": 0.7203061301052948, "grad_norm": 0.09843380004167557, "learning_rate": 4.321646712532292e-05, "loss": 0.0506, "num_input_tokens_seen": 11966144, "step": 3600 }, { "epoch": 0.7213065552859965, "grad_norm": 0.33211007714271545, "learning_rate": 4.319851632560222e-05, "loss": 0.0248, "num_input_tokens_seen": 11984736, "step": 3605 }, { "epoch": 0.7223069804666984, "grad_norm": 0.42315033078193665, "learning_rate": 4.318054554500719e-05, "loss": 0.0391, "num_input_tokens_seen": 12000384, "step": 3610 }, { "epoch": 0.7233074056474001, "grad_norm": 0.5389741063117981, "learning_rate": 4.316255480326864e-05, "loss": 0.0388, "num_input_tokens_seen": 12017056, "step": 3615 }, { "epoch": 0.7243078308281019, "grad_norm": 0.49584582448005676, "learning_rate": 4.314454412013934e-05, "loss": 0.0166, "num_input_tokens_seen": 12033408, "step": 3620 }, { "epoch": 0.7253082560088038, "grad_norm": 0.6536705493927002, "learning_rate": 4.312651351539392e-05, "loss": 0.0361, "num_input_tokens_seen": 12049856, "step": 3625 }, { "epoch": 0.7263086811895055, "grad_norm": 0.4414742887020111, "learning_rate": 4.31084630088289e-05, "loss": 0.0282, "num_input_tokens_seen": 12067008, "step": 3630 }, { "epoch": 0.7273091063702073, "grad_norm": 0.4404211640357971, "learning_rate": 4.309039262026265e-05, "loss": 0.0468, "num_input_tokens_seen": 12084160, "step": 3635 }, { "epoch": 0.7283095315509092, "grad_norm": 0.20715558528900146, "learning_rate": 4.307230236953538e-05, "loss": 0.0295, "num_input_tokens_seen": 12100960, "step": 3640 }, { "epoch": 0.7293099567316109, "grad_norm": 0.30401870608329773, "learning_rate": 4.305419227650908e-05, "loss": 0.0337, "num_input_tokens_seen": 12116352, "step": 3645 }, { "epoch": 0.7303103819123127, "grad_norm": 0.3494117558002472, "learning_rate": 4.303606236106754e-05, "loss": 0.0366, "num_input_tokens_seen": 12132832, "step": 3650 }, { "epoch": 0.7313108070930145, "grad_norm": 0.653393030166626, "learning_rate": 4.3017912643116324e-05, "loss": 0.028, "num_input_tokens_seen": 12148256, "step": 3655 }, { "epoch": 0.7323112322737163, "grad_norm": 0.4546215534210205, "learning_rate": 4.2999743142582735e-05, "loss": 0.0266, "num_input_tokens_seen": 12164768, "step": 3660 }, { "epoch": 0.7333116574544182, "grad_norm": 0.6860993504524231, "learning_rate": 4.298155387941577e-05, "loss": 0.0313, "num_input_tokens_seen": 12181696, "step": 3665 }, { "epoch": 0.7343120826351199, "grad_norm": 0.6686315536499023, "learning_rate": 4.296334487358615e-05, "loss": 0.0417, "num_input_tokens_seen": 12199712, "step": 3670 }, { "epoch": 0.7353125078158217, "grad_norm": 0.9289959073066711, "learning_rate": 4.294511614508628e-05, "loss": 0.0343, "num_input_tokens_seen": 12216000, "step": 3675 }, { "epoch": 0.7363129329965236, "grad_norm": 1.0001676082611084, "learning_rate": 4.29268677139302e-05, "loss": 0.0626, "num_input_tokens_seen": 12232768, "step": 3680 }, { "epoch": 0.7373133581772253, "grad_norm": 0.10140807181596756, "learning_rate": 4.290859960015357e-05, "loss": 0.0123, "num_input_tokens_seen": 12249440, "step": 3685 }, { "epoch": 0.7383137833579271, "grad_norm": 0.5375101566314697, "learning_rate": 4.2890311823813696e-05, "loss": 0.0407, "num_input_tokens_seen": 12265024, "step": 3690 }, { "epoch": 0.739314208538629, "grad_norm": 0.38093164563179016, "learning_rate": 4.2872004404989454e-05, "loss": 0.0332, "num_input_tokens_seen": 12282144, "step": 3695 }, { "epoch": 0.7403146337193307, "grad_norm": 0.461363822221756, "learning_rate": 4.2853677363781274e-05, "loss": 0.0271, "num_input_tokens_seen": 12298080, "step": 3700 }, { "epoch": 0.7413150589000325, "grad_norm": 1.1793761253356934, "learning_rate": 4.283533072031116e-05, "loss": 0.043, "num_input_tokens_seen": 12314080, "step": 3705 }, { "epoch": 0.7423154840807343, "grad_norm": 0.13559503853321075, "learning_rate": 4.281696449472261e-05, "loss": 0.0193, "num_input_tokens_seen": 12331680, "step": 3710 }, { "epoch": 0.7433159092614361, "grad_norm": 0.27876919507980347, "learning_rate": 4.279857870718064e-05, "loss": 0.0371, "num_input_tokens_seen": 12347424, "step": 3715 }, { "epoch": 0.744316334442138, "grad_norm": 0.5252572894096375, "learning_rate": 4.2780173377871725e-05, "loss": 0.0236, "num_input_tokens_seen": 12363392, "step": 3720 }, { "epoch": 0.7453167596228397, "grad_norm": 0.5381074547767639, "learning_rate": 4.276174852700382e-05, "loss": 0.0668, "num_input_tokens_seen": 12379616, "step": 3725 }, { "epoch": 0.7463171848035415, "grad_norm": 1.9420994520187378, "learning_rate": 4.2743304174806295e-05, "loss": 0.0542, "num_input_tokens_seen": 12396896, "step": 3730 }, { "epoch": 0.7473176099842433, "grad_norm": 0.14084391295909882, "learning_rate": 4.2724840341529936e-05, "loss": 0.0377, "num_input_tokens_seen": 12413472, "step": 3735 }, { "epoch": 0.7483180351649451, "grad_norm": 0.19236116111278534, "learning_rate": 4.270635704744693e-05, "loss": 0.0285, "num_input_tokens_seen": 12429760, "step": 3740 }, { "epoch": 0.7493184603456469, "grad_norm": 0.2585514485836029, "learning_rate": 4.268785431285081e-05, "loss": 0.0216, "num_input_tokens_seen": 12446816, "step": 3745 }, { "epoch": 0.7503188855263487, "grad_norm": 0.4992278218269348, "learning_rate": 4.266933215805645e-05, "loss": 0.0154, "num_input_tokens_seen": 12462464, "step": 3750 }, { "epoch": 0.7513193107070505, "grad_norm": 0.556191086769104, "learning_rate": 4.2650790603400096e-05, "loss": 0.041, "num_input_tokens_seen": 12478592, "step": 3755 }, { "epoch": 0.7523197358877523, "grad_norm": 0.7319716215133667, "learning_rate": 4.263222966923923e-05, "loss": 0.0235, "num_input_tokens_seen": 12495936, "step": 3760 }, { "epoch": 0.7533201610684541, "grad_norm": 0.29031893610954285, "learning_rate": 4.261364937595265e-05, "loss": 0.0574, "num_input_tokens_seen": 12512992, "step": 3765 }, { "epoch": 0.7543205862491559, "grad_norm": 0.34786590933799744, "learning_rate": 4.2595049743940386e-05, "loss": 0.0243, "num_input_tokens_seen": 12529504, "step": 3770 }, { "epoch": 0.7553210114298576, "grad_norm": 0.21003863215446472, "learning_rate": 4.257643079362372e-05, "loss": 0.0462, "num_input_tokens_seen": 12547072, "step": 3775 }, { "epoch": 0.7563214366105595, "grad_norm": 0.4604493975639343, "learning_rate": 4.255779254544515e-05, "loss": 0.0226, "num_input_tokens_seen": 12563712, "step": 3780 }, { "epoch": 0.7573218617912613, "grad_norm": 0.5216447114944458, "learning_rate": 4.253913501986834e-05, "loss": 0.0325, "num_input_tokens_seen": 12580544, "step": 3785 }, { "epoch": 0.7583222869719631, "grad_norm": 0.39751800894737244, "learning_rate": 4.252045823737812e-05, "loss": 0.0335, "num_input_tokens_seen": 12596832, "step": 3790 }, { "epoch": 0.7593227121526649, "grad_norm": 0.2205200493335724, "learning_rate": 4.250176221848049e-05, "loss": 0.0424, "num_input_tokens_seen": 12614656, "step": 3795 }, { "epoch": 0.7603231373333666, "grad_norm": 0.404427170753479, "learning_rate": 4.248304698370253e-05, "loss": 0.0269, "num_input_tokens_seen": 12631392, "step": 3800 }, { "epoch": 0.7613235625140685, "grad_norm": 0.8761758804321289, "learning_rate": 4.2464312553592456e-05, "loss": 0.0381, "num_input_tokens_seen": 12648768, "step": 3805 }, { "epoch": 0.7623239876947703, "grad_norm": 0.7704747319221497, "learning_rate": 4.244555894871954e-05, "loss": 0.0398, "num_input_tokens_seen": 12664736, "step": 3810 }, { "epoch": 0.763324412875472, "grad_norm": 0.2651312053203583, "learning_rate": 4.2426786189674116e-05, "loss": 0.0293, "num_input_tokens_seen": 12679712, "step": 3815 }, { "epoch": 0.7643248380561739, "grad_norm": 0.5004244446754456, "learning_rate": 4.2407994297067534e-05, "loss": 0.0598, "num_input_tokens_seen": 12697792, "step": 3820 }, { "epoch": 0.7653252632368757, "grad_norm": 0.3987187147140503, "learning_rate": 4.238918329153215e-05, "loss": 0.0501, "num_input_tokens_seen": 12714080, "step": 3825 }, { "epoch": 0.7663256884175774, "grad_norm": 0.35544225573539734, "learning_rate": 4.2370353193721336e-05, "loss": 0.02, "num_input_tokens_seen": 12729504, "step": 3830 }, { "epoch": 0.7673261135982793, "grad_norm": 0.6649848222732544, "learning_rate": 4.23515040243094e-05, "loss": 0.0341, "num_input_tokens_seen": 12746048, "step": 3835 }, { "epoch": 0.768326538778981, "grad_norm": 1.0058033466339111, "learning_rate": 4.2332635803991575e-05, "loss": 0.0267, "num_input_tokens_seen": 12763040, "step": 3840 }, { "epoch": 0.7693269639596829, "grad_norm": 0.18642783164978027, "learning_rate": 4.2313748553484056e-05, "loss": 0.0268, "num_input_tokens_seen": 12778784, "step": 3845 }, { "epoch": 0.7703273891403847, "grad_norm": 0.3343425393104553, "learning_rate": 4.229484229352388e-05, "loss": 0.0348, "num_input_tokens_seen": 12795264, "step": 3850 }, { "epoch": 0.7713278143210864, "grad_norm": 0.22427132725715637, "learning_rate": 4.2275917044869015e-05, "loss": 0.0389, "num_input_tokens_seen": 12812032, "step": 3855 }, { "epoch": 0.7723282395017883, "grad_norm": 0.2122434675693512, "learning_rate": 4.2256972828298214e-05, "loss": 0.0137, "num_input_tokens_seen": 12829440, "step": 3860 }, { "epoch": 0.7733286646824901, "grad_norm": 0.7551167607307434, "learning_rate": 4.2238009664611096e-05, "loss": 0.048, "num_input_tokens_seen": 12845952, "step": 3865 }, { "epoch": 0.7743290898631918, "grad_norm": 0.5768938064575195, "learning_rate": 4.221902757462807e-05, "loss": 0.0353, "num_input_tokens_seen": 12862848, "step": 3870 }, { "epoch": 0.7753295150438937, "grad_norm": 0.44510433077812195, "learning_rate": 4.220002657919032e-05, "loss": 0.0224, "num_input_tokens_seen": 12879424, "step": 3875 }, { "epoch": 0.7763299402245954, "grad_norm": 0.992270827293396, "learning_rate": 4.218100669915982e-05, "loss": 0.0444, "num_input_tokens_seen": 12895968, "step": 3880 }, { "epoch": 0.7773303654052972, "grad_norm": 0.4635940492153168, "learning_rate": 4.2161967955419226e-05, "loss": 0.0234, "num_input_tokens_seen": 12912704, "step": 3885 }, { "epoch": 0.7783307905859991, "grad_norm": 0.28763484954833984, "learning_rate": 4.214291036887194e-05, "loss": 0.0254, "num_input_tokens_seen": 12929600, "step": 3890 }, { "epoch": 0.7793312157667008, "grad_norm": 0.44379696249961853, "learning_rate": 4.212383396044204e-05, "loss": 0.0319, "num_input_tokens_seen": 12945280, "step": 3895 }, { "epoch": 0.7803316409474026, "grad_norm": 0.8486232757568359, "learning_rate": 4.210473875107428e-05, "loss": 0.0622, "num_input_tokens_seen": 12962048, "step": 3900 }, { "epoch": 0.7813320661281045, "grad_norm": 0.3712434470653534, "learning_rate": 4.208562476173404e-05, "loss": 0.0239, "num_input_tokens_seen": 12977888, "step": 3905 }, { "epoch": 0.7823324913088062, "grad_norm": 0.13711759448051453, "learning_rate": 4.206649201340735e-05, "loss": 0.0243, "num_input_tokens_seen": 12994752, "step": 3910 }, { "epoch": 0.7833329164895081, "grad_norm": 0.23532859981060028, "learning_rate": 4.2047340527100786e-05, "loss": 0.0224, "num_input_tokens_seen": 13011776, "step": 3915 }, { "epoch": 0.7843333416702098, "grad_norm": 0.6952040195465088, "learning_rate": 4.202817032384155e-05, "loss": 0.0422, "num_input_tokens_seen": 13028352, "step": 3920 }, { "epoch": 0.7853337668509116, "grad_norm": 0.4595640003681183, "learning_rate": 4.2008981424677354e-05, "loss": 0.0513, "num_input_tokens_seen": 13044704, "step": 3925 }, { "epoch": 0.7863341920316135, "grad_norm": 0.35172584652900696, "learning_rate": 4.198977385067645e-05, "loss": 0.0295, "num_input_tokens_seen": 13061952, "step": 3930 }, { "epoch": 0.7873346172123152, "grad_norm": 0.1835983693599701, "learning_rate": 4.1970547622927615e-05, "loss": 0.035, "num_input_tokens_seen": 13077760, "step": 3935 }, { "epoch": 0.788335042393017, "grad_norm": 0.37188035249710083, "learning_rate": 4.1951302762540074e-05, "loss": 0.0202, "num_input_tokens_seen": 13094912, "step": 3940 }, { "epoch": 0.7893354675737189, "grad_norm": 0.17795595526695251, "learning_rate": 4.193203929064353e-05, "loss": 0.0132, "num_input_tokens_seen": 13111520, "step": 3945 }, { "epoch": 0.7903358927544206, "grad_norm": 0.6609641909599304, "learning_rate": 4.191275722838811e-05, "loss": 0.0494, "num_input_tokens_seen": 13128960, "step": 3950 }, { "epoch": 0.7913363179351224, "grad_norm": 0.22620953619480133, "learning_rate": 4.189345659694436e-05, "loss": 0.0274, "num_input_tokens_seen": 13146656, "step": 3955 }, { "epoch": 0.7923367431158242, "grad_norm": 0.7801216244697571, "learning_rate": 4.187413741750322e-05, "loss": 0.0229, "num_input_tokens_seen": 13163104, "step": 3960 }, { "epoch": 0.793337168296526, "grad_norm": 0.884833037853241, "learning_rate": 4.185479971127596e-05, "loss": 0.0286, "num_input_tokens_seen": 13179744, "step": 3965 }, { "epoch": 0.7943375934772279, "grad_norm": 0.5032994151115417, "learning_rate": 4.1835443499494245e-05, "loss": 0.0409, "num_input_tokens_seen": 13197504, "step": 3970 }, { "epoch": 0.7953380186579296, "grad_norm": 1.011844277381897, "learning_rate": 4.181606880341001e-05, "loss": 0.042, "num_input_tokens_seen": 13213824, "step": 3975 }, { "epoch": 0.7963384438386314, "grad_norm": 0.6795077323913574, "learning_rate": 4.179667564429551e-05, "loss": 0.028, "num_input_tokens_seen": 13230656, "step": 3980 }, { "epoch": 0.7973388690193333, "grad_norm": 0.35327115654945374, "learning_rate": 4.177726404344326e-05, "loss": 0.0344, "num_input_tokens_seen": 13246944, "step": 3985 }, { "epoch": 0.798339294200035, "grad_norm": 0.2503950893878937, "learning_rate": 4.1757834022166034e-05, "loss": 0.0305, "num_input_tokens_seen": 13263872, "step": 3990 }, { "epoch": 0.7993397193807368, "grad_norm": 0.27673956751823425, "learning_rate": 4.173838560179682e-05, "loss": 0.051, "num_input_tokens_seen": 13280032, "step": 3995 }, { "epoch": 0.8003401445614386, "grad_norm": 0.47994551062583923, "learning_rate": 4.171891880368882e-05, "loss": 0.0438, "num_input_tokens_seen": 13296128, "step": 4000 }, { "epoch": 0.8013405697421404, "grad_norm": 0.5603805184364319, "learning_rate": 4.16994336492154e-05, "loss": 0.0375, "num_input_tokens_seen": 13312704, "step": 4005 }, { "epoch": 0.8023409949228422, "grad_norm": 0.2629021406173706, "learning_rate": 4.167993015977009e-05, "loss": 0.0334, "num_input_tokens_seen": 13329920, "step": 4010 }, { "epoch": 0.803341420103544, "grad_norm": 0.29546400904655457, "learning_rate": 4.166040835676653e-05, "loss": 0.0343, "num_input_tokens_seen": 13346976, "step": 4015 }, { "epoch": 0.8043418452842458, "grad_norm": 0.349707692861557, "learning_rate": 4.1640868261638505e-05, "loss": 0.0314, "num_input_tokens_seen": 13364512, "step": 4020 }, { "epoch": 0.8053422704649476, "grad_norm": 0.8134015798568726, "learning_rate": 4.162130989583985e-05, "loss": 0.0262, "num_input_tokens_seen": 13381696, "step": 4025 }, { "epoch": 0.8063426956456494, "grad_norm": 0.6181686520576477, "learning_rate": 4.160173328084447e-05, "loss": 0.0287, "num_input_tokens_seen": 13398976, "step": 4030 }, { "epoch": 0.8073431208263512, "grad_norm": 0.3832380473613739, "learning_rate": 4.158213843814631e-05, "loss": 0.0215, "num_input_tokens_seen": 13416032, "step": 4035 }, { "epoch": 0.808343546007053, "grad_norm": 0.44530972838401794, "learning_rate": 4.156252538925933e-05, "loss": 0.047, "num_input_tokens_seen": 13432352, "step": 4040 }, { "epoch": 0.8093439711877548, "grad_norm": 0.5037532448768616, "learning_rate": 4.154289415571749e-05, "loss": 0.0393, "num_input_tokens_seen": 13449472, "step": 4045 }, { "epoch": 0.8103443963684566, "grad_norm": 0.2083626538515091, "learning_rate": 4.152324475907468e-05, "loss": 0.0348, "num_input_tokens_seen": 13465184, "step": 4050 }, { "epoch": 0.8113448215491584, "grad_norm": 0.7969433665275574, "learning_rate": 4.1503577220904747e-05, "loss": 0.0309, "num_input_tokens_seen": 13481312, "step": 4055 }, { "epoch": 0.8123452467298602, "grad_norm": 0.5527857542037964, "learning_rate": 4.148389156280149e-05, "loss": 0.0498, "num_input_tokens_seen": 13498944, "step": 4060 }, { "epoch": 0.813345671910562, "grad_norm": 0.28512829542160034, "learning_rate": 4.146418780637855e-05, "loss": 0.0264, "num_input_tokens_seen": 13514528, "step": 4065 }, { "epoch": 0.8143460970912638, "grad_norm": 0.4216727018356323, "learning_rate": 4.144446597326948e-05, "loss": 0.0388, "num_input_tokens_seen": 13532128, "step": 4070 }, { "epoch": 0.8153465222719656, "grad_norm": 0.38498228788375854, "learning_rate": 4.1424726085127666e-05, "loss": 0.0378, "num_input_tokens_seen": 13548320, "step": 4075 }, { "epoch": 0.8163469474526673, "grad_norm": 0.8461599349975586, "learning_rate": 4.140496816362631e-05, "loss": 0.0259, "num_input_tokens_seen": 13564320, "step": 4080 }, { "epoch": 0.8173473726333692, "grad_norm": 1.0074468851089478, "learning_rate": 4.1385192230458417e-05, "loss": 0.0312, "num_input_tokens_seen": 13579904, "step": 4085 }, { "epoch": 0.818347797814071, "grad_norm": 0.4712158739566803, "learning_rate": 4.136539830733677e-05, "loss": 0.0453, "num_input_tokens_seen": 13596832, "step": 4090 }, { "epoch": 0.8193482229947727, "grad_norm": 0.4534381628036499, "learning_rate": 4.13455864159939e-05, "loss": 0.0167, "num_input_tokens_seen": 13613920, "step": 4095 }, { "epoch": 0.8203486481754746, "grad_norm": 0.1941436380147934, "learning_rate": 4.1325756578182084e-05, "loss": 0.0408, "num_input_tokens_seen": 13631232, "step": 4100 }, { "epoch": 0.8213490733561764, "grad_norm": 0.18202431499958038, "learning_rate": 4.130590881567328e-05, "loss": 0.0314, "num_input_tokens_seen": 13647392, "step": 4105 }, { "epoch": 0.8223494985368782, "grad_norm": 0.4083399176597595, "learning_rate": 4.1286043150259134e-05, "loss": 0.0398, "num_input_tokens_seen": 13663872, "step": 4110 }, { "epoch": 0.82334992371758, "grad_norm": 0.29242146015167236, "learning_rate": 4.126615960375095e-05, "loss": 0.0323, "num_input_tokens_seen": 13679616, "step": 4115 }, { "epoch": 0.8243503488982817, "grad_norm": 0.19398359954357147, "learning_rate": 4.124625819797967e-05, "loss": 0.0431, "num_input_tokens_seen": 13696416, "step": 4120 }, { "epoch": 0.8253507740789836, "grad_norm": 0.20560583472251892, "learning_rate": 4.122633895479584e-05, "loss": 0.0203, "num_input_tokens_seen": 13713088, "step": 4125 }, { "epoch": 0.8263511992596854, "grad_norm": 0.551224410533905, "learning_rate": 4.120640189606958e-05, "loss": 0.0281, "num_input_tokens_seen": 13730080, "step": 4130 }, { "epoch": 0.8273516244403871, "grad_norm": 0.5998563170433044, "learning_rate": 4.118644704369057e-05, "loss": 0.0246, "num_input_tokens_seen": 13746272, "step": 4135 }, { "epoch": 0.828352049621089, "grad_norm": 0.410586953163147, "learning_rate": 4.1166474419568055e-05, "loss": 0.0608, "num_input_tokens_seen": 13762656, "step": 4140 }, { "epoch": 0.8293524748017908, "grad_norm": 1.009325623512268, "learning_rate": 4.114648404563076e-05, "loss": 0.0401, "num_input_tokens_seen": 13779584, "step": 4145 }, { "epoch": 0.8303528999824925, "grad_norm": 0.4814990758895874, "learning_rate": 4.1126475943826924e-05, "loss": 0.043, "num_input_tokens_seen": 13795648, "step": 4150 }, { "epoch": 0.8313533251631944, "grad_norm": 0.48686766624450684, "learning_rate": 4.110645013612423e-05, "loss": 0.0287, "num_input_tokens_seen": 13812416, "step": 4155 }, { "epoch": 0.8323537503438961, "grad_norm": 0.3141302168369293, "learning_rate": 4.10864066445098e-05, "loss": 0.0188, "num_input_tokens_seen": 13828864, "step": 4160 }, { "epoch": 0.833354175524598, "grad_norm": 0.38863256573677063, "learning_rate": 4.1066345490990197e-05, "loss": 0.0436, "num_input_tokens_seen": 13844768, "step": 4165 }, { "epoch": 0.8343546007052998, "grad_norm": 0.7164223194122314, "learning_rate": 4.104626669759134e-05, "loss": 0.0319, "num_input_tokens_seen": 13861216, "step": 4170 }, { "epoch": 0.8353550258860015, "grad_norm": 0.37101924419403076, "learning_rate": 4.1026170286358544e-05, "loss": 0.0393, "num_input_tokens_seen": 13877888, "step": 4175 }, { "epoch": 0.8363554510667034, "grad_norm": 0.3636489510536194, "learning_rate": 4.100605627935647e-05, "loss": 0.0268, "num_input_tokens_seen": 13894336, "step": 4180 }, { "epoch": 0.8373558762474052, "grad_norm": 0.8379486203193665, "learning_rate": 4.098592469866906e-05, "loss": 0.0376, "num_input_tokens_seen": 13911360, "step": 4185 }, { "epoch": 0.8383563014281069, "grad_norm": 0.40897300839424133, "learning_rate": 4.0965775566399593e-05, "loss": 0.0222, "num_input_tokens_seen": 13927936, "step": 4190 }, { "epoch": 0.8393567266088088, "grad_norm": 0.8142618536949158, "learning_rate": 4.09456089046706e-05, "loss": 0.0574, "num_input_tokens_seen": 13945536, "step": 4195 }, { "epoch": 0.8403571517895105, "grad_norm": 0.18752454221248627, "learning_rate": 4.092542473562386e-05, "loss": 0.0598, "num_input_tokens_seen": 13962592, "step": 4200 }, { "epoch": 0.8413575769702123, "grad_norm": 0.6369066834449768, "learning_rate": 4.090522308142038e-05, "loss": 0.0315, "num_input_tokens_seen": 13979776, "step": 4205 }, { "epoch": 0.8423580021509142, "grad_norm": 0.27474310994148254, "learning_rate": 4.088500396424033e-05, "loss": 0.0249, "num_input_tokens_seen": 13996192, "step": 4210 }, { "epoch": 0.8433584273316159, "grad_norm": 0.548443615436554, "learning_rate": 4.086476740628312e-05, "loss": 0.0301, "num_input_tokens_seen": 14012096, "step": 4215 }, { "epoch": 0.8443588525123177, "grad_norm": 0.49320584535598755, "learning_rate": 4.084451342976726e-05, "loss": 0.0328, "num_input_tokens_seen": 14028832, "step": 4220 }, { "epoch": 0.8453592776930196, "grad_norm": 0.5967304110527039, "learning_rate": 4.082424205693038e-05, "loss": 0.0366, "num_input_tokens_seen": 14046336, "step": 4225 }, { "epoch": 0.8463597028737213, "grad_norm": 0.21384429931640625, "learning_rate": 4.080395331002925e-05, "loss": 0.0308, "num_input_tokens_seen": 14062304, "step": 4230 }, { "epoch": 0.8473601280544232, "grad_norm": 0.6950636506080627, "learning_rate": 4.078364721133967e-05, "loss": 0.0457, "num_input_tokens_seen": 14078368, "step": 4235 }, { "epoch": 0.8483605532351249, "grad_norm": 0.4137358069419861, "learning_rate": 4.0763323783156525e-05, "loss": 0.026, "num_input_tokens_seen": 14094944, "step": 4240 }, { "epoch": 0.8493609784158267, "grad_norm": 0.5785394906997681, "learning_rate": 4.074298304779371e-05, "loss": 0.0479, "num_input_tokens_seen": 14112384, "step": 4245 }, { "epoch": 0.8503614035965286, "grad_norm": 0.6416853070259094, "learning_rate": 4.0722625027584126e-05, "loss": 0.0321, "num_input_tokens_seen": 14129536, "step": 4250 }, { "epoch": 0.8513618287772303, "grad_norm": 0.499644011259079, "learning_rate": 4.070224974487966e-05, "loss": 0.0286, "num_input_tokens_seen": 14146976, "step": 4255 }, { "epoch": 0.8523622539579321, "grad_norm": 0.7566441893577576, "learning_rate": 4.0681857222051134e-05, "loss": 0.0333, "num_input_tokens_seen": 14163168, "step": 4260 }, { "epoch": 0.853362679138634, "grad_norm": 0.3304438292980194, "learning_rate": 4.066144748148832e-05, "loss": 0.037, "num_input_tokens_seen": 14180416, "step": 4265 }, { "epoch": 0.8543631043193357, "grad_norm": 0.24730849266052246, "learning_rate": 4.0641020545599875e-05, "loss": 0.0272, "num_input_tokens_seen": 14197472, "step": 4270 }, { "epoch": 0.8553635295000375, "grad_norm": 1.0695536136627197, "learning_rate": 4.062057643681335e-05, "loss": 0.0277, "num_input_tokens_seen": 14214336, "step": 4275 }, { "epoch": 0.8563639546807393, "grad_norm": 0.5582004189491272, "learning_rate": 4.0600115177575144e-05, "loss": 0.0216, "num_input_tokens_seen": 14230400, "step": 4280 }, { "epoch": 0.8573643798614411, "grad_norm": 0.5465623140335083, "learning_rate": 4.057963679035048e-05, "loss": 0.053, "num_input_tokens_seen": 14247136, "step": 4285 }, { "epoch": 0.858364805042143, "grad_norm": 0.5054659843444824, "learning_rate": 4.055914129762339e-05, "loss": 0.044, "num_input_tokens_seen": 14265056, "step": 4290 }, { "epoch": 0.8593652302228447, "grad_norm": 0.44049715995788574, "learning_rate": 4.053862872189671e-05, "loss": 0.0246, "num_input_tokens_seen": 14281696, "step": 4295 }, { "epoch": 0.8603656554035465, "grad_norm": 1.1087952852249146, "learning_rate": 4.0518099085692e-05, "loss": 0.0559, "num_input_tokens_seen": 14299200, "step": 4300 }, { "epoch": 0.8613660805842484, "grad_norm": 0.21384133398532867, "learning_rate": 4.049755241154955e-05, "loss": 0.0306, "num_input_tokens_seen": 14316608, "step": 4305 }, { "epoch": 0.8623665057649501, "grad_norm": 0.3457808792591095, "learning_rate": 4.047698872202839e-05, "loss": 0.0212, "num_input_tokens_seen": 14332992, "step": 4310 }, { "epoch": 0.8633669309456519, "grad_norm": 0.172733336687088, "learning_rate": 4.0456408039706204e-05, "loss": 0.0191, "num_input_tokens_seen": 14349216, "step": 4315 }, { "epoch": 0.8643673561263537, "grad_norm": 0.5303016304969788, "learning_rate": 4.043581038717934e-05, "loss": 0.038, "num_input_tokens_seen": 14367488, "step": 4320 }, { "epoch": 0.8653677813070555, "grad_norm": 0.3045515716075897, "learning_rate": 4.041519578706279e-05, "loss": 0.0265, "num_input_tokens_seen": 14385280, "step": 4325 }, { "epoch": 0.8663682064877573, "grad_norm": 0.18472611904144287, "learning_rate": 4.039456426199012e-05, "loss": 0.0412, "num_input_tokens_seen": 14402528, "step": 4330 }, { "epoch": 0.8673686316684591, "grad_norm": 0.5062572956085205, "learning_rate": 4.037391583461353e-05, "loss": 0.0209, "num_input_tokens_seen": 14420160, "step": 4335 }, { "epoch": 0.8683690568491609, "grad_norm": 0.6826553344726562, "learning_rate": 4.0353250527603726e-05, "loss": 0.0409, "num_input_tokens_seen": 14437824, "step": 4340 }, { "epoch": 0.8693694820298626, "grad_norm": 0.11310242116451263, "learning_rate": 4.0332568363649974e-05, "loss": 0.0559, "num_input_tokens_seen": 14455264, "step": 4345 }, { "epoch": 0.8703699072105645, "grad_norm": 0.291314035654068, "learning_rate": 4.0311869365460054e-05, "loss": 0.028, "num_input_tokens_seen": 14470944, "step": 4350 }, { "epoch": 0.8713703323912663, "grad_norm": 0.6921257972717285, "learning_rate": 4.029115355576022e-05, "loss": 0.0204, "num_input_tokens_seen": 14486784, "step": 4355 }, { "epoch": 0.8723707575719681, "grad_norm": 0.3011789321899414, "learning_rate": 4.027042095729517e-05, "loss": 0.0439, "num_input_tokens_seen": 14503392, "step": 4360 }, { "epoch": 0.8733711827526699, "grad_norm": 0.09817075729370117, "learning_rate": 4.024967159282805e-05, "loss": 0.0226, "num_input_tokens_seen": 14520000, "step": 4365 }, { "epoch": 0.8743716079333717, "grad_norm": 0.28903499245643616, "learning_rate": 4.0228905485140415e-05, "loss": 0.0106, "num_input_tokens_seen": 14535648, "step": 4370 }, { "epoch": 0.8753720331140735, "grad_norm": 0.3278878927230835, "learning_rate": 4.020812265703221e-05, "loss": 0.0384, "num_input_tokens_seen": 14552096, "step": 4375 }, { "epoch": 0.8763724582947753, "grad_norm": 0.8860204219818115, "learning_rate": 4.018732313132171e-05, "loss": 0.04, "num_input_tokens_seen": 14568736, "step": 4380 }, { "epoch": 0.877372883475477, "grad_norm": 0.23948296904563904, "learning_rate": 4.016650693084555e-05, "loss": 0.0361, "num_input_tokens_seen": 14586208, "step": 4385 }, { "epoch": 0.8783733086561789, "grad_norm": 0.1991083174943924, "learning_rate": 4.014567407845866e-05, "loss": 0.0252, "num_input_tokens_seen": 14602752, "step": 4390 }, { "epoch": 0.8793737338368807, "grad_norm": 0.6444497108459473, "learning_rate": 4.0124824597034274e-05, "loss": 0.0399, "num_input_tokens_seen": 14619360, "step": 4395 }, { "epoch": 0.8803741590175824, "grad_norm": 0.44131720066070557, "learning_rate": 4.0103958509463835e-05, "loss": 0.0316, "num_input_tokens_seen": 14636256, "step": 4400 }, { "epoch": 0.8813745841982843, "grad_norm": 0.23221592605113983, "learning_rate": 4.008307583865708e-05, "loss": 0.0172, "num_input_tokens_seen": 14652992, "step": 4405 }, { "epoch": 0.882375009378986, "grad_norm": 0.5025535225868225, "learning_rate": 4.00621766075419e-05, "loss": 0.036, "num_input_tokens_seen": 14669632, "step": 4410 }, { "epoch": 0.8833754345596879, "grad_norm": 0.27033570408821106, "learning_rate": 4.004126083906441e-05, "loss": 0.0296, "num_input_tokens_seen": 14685440, "step": 4415 }, { "epoch": 0.8843758597403897, "grad_norm": 0.0698275938630104, "learning_rate": 4.002032855618885e-05, "loss": 0.0109, "num_input_tokens_seen": 14702240, "step": 4420 }, { "epoch": 0.8853762849210914, "grad_norm": 1.5025676488876343, "learning_rate": 3.999937978189761e-05, "loss": 0.0513, "num_input_tokens_seen": 14719072, "step": 4425 }, { "epoch": 0.8863767101017933, "grad_norm": 0.9077886343002319, "learning_rate": 3.9978414539191184e-05, "loss": 0.0441, "num_input_tokens_seen": 14737344, "step": 4430 }, { "epoch": 0.8873771352824951, "grad_norm": 0.20032626390457153, "learning_rate": 3.995743285108814e-05, "loss": 0.0327, "num_input_tokens_seen": 14754432, "step": 4435 }, { "epoch": 0.8883775604631968, "grad_norm": 0.7127216458320618, "learning_rate": 3.993643474062512e-05, "loss": 0.0283, "num_input_tokens_seen": 14771008, "step": 4440 }, { "epoch": 0.8893779856438987, "grad_norm": 0.3725959360599518, "learning_rate": 3.991542023085676e-05, "loss": 0.0155, "num_input_tokens_seen": 14788352, "step": 4445 }, { "epoch": 0.8903784108246005, "grad_norm": 0.587356448173523, "learning_rate": 3.9894389344855755e-05, "loss": 0.0648, "num_input_tokens_seen": 14804928, "step": 4450 }, { "epoch": 0.8913788360053022, "grad_norm": 0.41772809624671936, "learning_rate": 3.9873342105712725e-05, "loss": 0.0299, "num_input_tokens_seen": 14821312, "step": 4455 }, { "epoch": 0.8923792611860041, "grad_norm": 0.6791223287582397, "learning_rate": 3.985227853653628e-05, "loss": 0.0475, "num_input_tokens_seen": 14837504, "step": 4460 }, { "epoch": 0.8933796863667058, "grad_norm": 0.6203396320343018, "learning_rate": 3.983119866045296e-05, "loss": 0.0505, "num_input_tokens_seen": 14855584, "step": 4465 }, { "epoch": 0.8943801115474076, "grad_norm": 1.6683917045593262, "learning_rate": 3.9810102500607184e-05, "loss": 0.0485, "num_input_tokens_seen": 14872640, "step": 4470 }, { "epoch": 0.8953805367281095, "grad_norm": 0.2774893343448639, "learning_rate": 3.978899008016127e-05, "loss": 0.044, "num_input_tokens_seen": 14890848, "step": 4475 }, { "epoch": 0.8963809619088112, "grad_norm": 0.5847212672233582, "learning_rate": 3.976786142229538e-05, "loss": 0.043, "num_input_tokens_seen": 14906144, "step": 4480 }, { "epoch": 0.8973813870895131, "grad_norm": 0.14882434904575348, "learning_rate": 3.974671655020752e-05, "loss": 0.0396, "num_input_tokens_seen": 14922400, "step": 4485 }, { "epoch": 0.8983818122702149, "grad_norm": 0.16097332537174225, "learning_rate": 3.972555548711347e-05, "loss": 0.0185, "num_input_tokens_seen": 14939008, "step": 4490 }, { "epoch": 0.8993822374509166, "grad_norm": 0.20941989123821259, "learning_rate": 3.9704378256246805e-05, "loss": 0.0363, "num_input_tokens_seen": 14956256, "step": 4495 }, { "epoch": 0.9003826626316185, "grad_norm": 0.7076755166053772, "learning_rate": 3.968318488085886e-05, "loss": 0.0432, "num_input_tokens_seen": 14971968, "step": 4500 }, { "epoch": 0.9013830878123202, "grad_norm": 0.6078043580055237, "learning_rate": 3.9661975384218664e-05, "loss": 0.0355, "num_input_tokens_seen": 14988800, "step": 4505 }, { "epoch": 0.902383512993022, "grad_norm": 0.481460303068161, "learning_rate": 3.9640749789612986e-05, "loss": 0.029, "num_input_tokens_seen": 15005120, "step": 4510 }, { "epoch": 0.9033839381737239, "grad_norm": 0.4094735085964203, "learning_rate": 3.9619508120346235e-05, "loss": 0.0356, "num_input_tokens_seen": 15022176, "step": 4515 }, { "epoch": 0.9043843633544256, "grad_norm": 0.18756358325481415, "learning_rate": 3.95982503997405e-05, "loss": 0.0254, "num_input_tokens_seen": 15038400, "step": 4520 }, { "epoch": 0.9053847885351274, "grad_norm": 0.2987261116504669, "learning_rate": 3.957697665113547e-05, "loss": 0.031, "num_input_tokens_seen": 15054624, "step": 4525 }, { "epoch": 0.9063852137158293, "grad_norm": 0.13014473021030426, "learning_rate": 3.9555686897888426e-05, "loss": 0.0274, "num_input_tokens_seen": 15070848, "step": 4530 }, { "epoch": 0.907385638896531, "grad_norm": 0.504302442073822, "learning_rate": 3.953438116337425e-05, "loss": 0.0464, "num_input_tokens_seen": 15087936, "step": 4535 }, { "epoch": 0.9083860640772328, "grad_norm": 0.47447165846824646, "learning_rate": 3.951305947098535e-05, "loss": 0.0302, "num_input_tokens_seen": 15104512, "step": 4540 }, { "epoch": 0.9093864892579346, "grad_norm": 0.5051746368408203, "learning_rate": 3.949172184413166e-05, "loss": 0.0413, "num_input_tokens_seen": 15120064, "step": 4545 }, { "epoch": 0.9103869144386364, "grad_norm": 0.5719951391220093, "learning_rate": 3.9470368306240615e-05, "loss": 0.0291, "num_input_tokens_seen": 15136000, "step": 4550 }, { "epoch": 0.9113873396193383, "grad_norm": 0.32244595885276794, "learning_rate": 3.944899888075709e-05, "loss": 0.0392, "num_input_tokens_seen": 15152704, "step": 4555 }, { "epoch": 0.91238776480004, "grad_norm": 0.2153882533311844, "learning_rate": 3.942761359114345e-05, "loss": 0.0244, "num_input_tokens_seen": 15170176, "step": 4560 }, { "epoch": 0.9133881899807418, "grad_norm": 0.40789932012557983, "learning_rate": 3.940621246087946e-05, "loss": 0.0365, "num_input_tokens_seen": 15186656, "step": 4565 }, { "epoch": 0.9143886151614437, "grad_norm": 0.38933348655700684, "learning_rate": 3.9384795513462234e-05, "loss": 0.0465, "num_input_tokens_seen": 15202592, "step": 4570 }, { "epoch": 0.9153890403421454, "grad_norm": 0.19240064918994904, "learning_rate": 3.936336277240633e-05, "loss": 0.0282, "num_input_tokens_seen": 15219904, "step": 4575 }, { "epoch": 0.9163894655228472, "grad_norm": 0.5518398880958557, "learning_rate": 3.9341914261243584e-05, "loss": 0.0333, "num_input_tokens_seen": 15236608, "step": 4580 }, { "epoch": 0.917389890703549, "grad_norm": 0.6835047602653503, "learning_rate": 3.932045000352318e-05, "loss": 0.0397, "num_input_tokens_seen": 15254016, "step": 4585 }, { "epoch": 0.9183903158842508, "grad_norm": 0.8669648170471191, "learning_rate": 3.9298970022811575e-05, "loss": 0.0398, "num_input_tokens_seen": 15270624, "step": 4590 }, { "epoch": 0.9193907410649526, "grad_norm": 0.37144604325294495, "learning_rate": 3.927747434269249e-05, "loss": 0.0323, "num_input_tokens_seen": 15287392, "step": 4595 }, { "epoch": 0.9203911662456544, "grad_norm": 1.0355428457260132, "learning_rate": 3.925596298676689e-05, "loss": 0.0466, "num_input_tokens_seen": 15304928, "step": 4600 }, { "epoch": 0.9213915914263562, "grad_norm": 0.20743988454341888, "learning_rate": 3.923443597865295e-05, "loss": 0.0258, "num_input_tokens_seen": 15322208, "step": 4605 }, { "epoch": 0.922392016607058, "grad_norm": 0.384706050157547, "learning_rate": 3.921289334198602e-05, "loss": 0.0269, "num_input_tokens_seen": 15338560, "step": 4610 }, { "epoch": 0.9233924417877598, "grad_norm": 0.7780063152313232, "learning_rate": 3.919133510041863e-05, "loss": 0.0301, "num_input_tokens_seen": 15356608, "step": 4615 }, { "epoch": 0.9243928669684616, "grad_norm": 1.501051425933838, "learning_rate": 3.916976127762042e-05, "loss": 0.0684, "num_input_tokens_seen": 15373184, "step": 4620 }, { "epoch": 0.9253932921491634, "grad_norm": 0.08556798845529556, "learning_rate": 3.914817189727815e-05, "loss": 0.0202, "num_input_tokens_seen": 15390112, "step": 4625 }, { "epoch": 0.9263937173298652, "grad_norm": 0.6085869073867798, "learning_rate": 3.912656698309565e-05, "loss": 0.0346, "num_input_tokens_seen": 15407680, "step": 4630 }, { "epoch": 0.927394142510567, "grad_norm": 0.22436808049678802, "learning_rate": 3.9104946558793834e-05, "loss": 0.0377, "num_input_tokens_seen": 15424384, "step": 4635 }, { "epoch": 0.9283945676912688, "grad_norm": 0.4365352392196655, "learning_rate": 3.908331064811061e-05, "loss": 0.0386, "num_input_tokens_seen": 15441152, "step": 4640 }, { "epoch": 0.9293949928719706, "grad_norm": 0.2901744544506073, "learning_rate": 3.90616592748009e-05, "loss": 0.0372, "num_input_tokens_seen": 15457344, "step": 4645 }, { "epoch": 0.9303954180526723, "grad_norm": 0.3928404152393341, "learning_rate": 3.903999246263662e-05, "loss": 0.0354, "num_input_tokens_seen": 15474400, "step": 4650 }, { "epoch": 0.9313958432333742, "grad_norm": 0.5478469729423523, "learning_rate": 3.901831023540662e-05, "loss": 0.0283, "num_input_tokens_seen": 15490272, "step": 4655 }, { "epoch": 0.932396268414076, "grad_norm": 0.5972186923027039, "learning_rate": 3.8996612616916686e-05, "loss": 0.0457, "num_input_tokens_seen": 15507392, "step": 4660 }, { "epoch": 0.9333966935947777, "grad_norm": 0.42526888847351074, "learning_rate": 3.897489963098948e-05, "loss": 0.0367, "num_input_tokens_seen": 15523712, "step": 4665 }, { "epoch": 0.9343971187754796, "grad_norm": 0.7272816300392151, "learning_rate": 3.8953171301464554e-05, "loss": 0.0367, "num_input_tokens_seen": 15539360, "step": 4670 }, { "epoch": 0.9353975439561814, "grad_norm": 0.4032025933265686, "learning_rate": 3.893142765219832e-05, "loss": 0.0254, "num_input_tokens_seen": 15556384, "step": 4675 }, { "epoch": 0.9363979691368832, "grad_norm": 0.1461431235074997, "learning_rate": 3.890966870706398e-05, "loss": 0.0206, "num_input_tokens_seen": 15572672, "step": 4680 }, { "epoch": 0.937398394317585, "grad_norm": 0.31067830324172974, "learning_rate": 3.8887894489951546e-05, "loss": 0.0396, "num_input_tokens_seen": 15589248, "step": 4685 }, { "epoch": 0.9383988194982867, "grad_norm": 0.2968614101409912, "learning_rate": 3.886610502476781e-05, "loss": 0.0426, "num_input_tokens_seen": 15604608, "step": 4690 }, { "epoch": 0.9393992446789886, "grad_norm": 0.48350194096565247, "learning_rate": 3.8844300335436286e-05, "loss": 0.043, "num_input_tokens_seen": 15621696, "step": 4695 }, { "epoch": 0.9403996698596904, "grad_norm": 0.6612088084220886, "learning_rate": 3.882248044589719e-05, "loss": 0.0405, "num_input_tokens_seen": 15638368, "step": 4700 }, { "epoch": 0.9414000950403921, "grad_norm": 0.32954537868499756, "learning_rate": 3.880064538010747e-05, "loss": 0.0104, "num_input_tokens_seen": 15655616, "step": 4705 }, { "epoch": 0.942400520221094, "grad_norm": 0.3914512097835541, "learning_rate": 3.8778795162040705e-05, "loss": 0.0312, "num_input_tokens_seen": 15672224, "step": 4710 }, { "epoch": 0.9434009454017958, "grad_norm": 0.1047367975115776, "learning_rate": 3.875692981568712e-05, "loss": 0.0224, "num_input_tokens_seen": 15689248, "step": 4715 }, { "epoch": 0.9444013705824975, "grad_norm": 0.23204562067985535, "learning_rate": 3.8735049365053536e-05, "loss": 0.0241, "num_input_tokens_seen": 15706144, "step": 4720 }, { "epoch": 0.9454017957631994, "grad_norm": 0.9369616508483887, "learning_rate": 3.871315383416337e-05, "loss": 0.0239, "num_input_tokens_seen": 15724032, "step": 4725 }, { "epoch": 0.9464022209439011, "grad_norm": 0.4923385679721832, "learning_rate": 3.869124324705662e-05, "loss": 0.0323, "num_input_tokens_seen": 15741184, "step": 4730 }, { "epoch": 0.947402646124603, "grad_norm": 0.5696051120758057, "learning_rate": 3.866931762778976e-05, "loss": 0.0722, "num_input_tokens_seen": 15757536, "step": 4735 }, { "epoch": 0.9484030713053048, "grad_norm": 1.1380066871643066, "learning_rate": 3.8647377000435825e-05, "loss": 0.0327, "num_input_tokens_seen": 15774208, "step": 4740 }, { "epoch": 0.9494034964860065, "grad_norm": 0.8190211057662964, "learning_rate": 3.862542138908428e-05, "loss": 0.0543, "num_input_tokens_seen": 15790752, "step": 4745 }, { "epoch": 0.9504039216667084, "grad_norm": 0.7366694211959839, "learning_rate": 3.860345081784107e-05, "loss": 0.0494, "num_input_tokens_seen": 15808608, "step": 4750 }, { "epoch": 0.9514043468474102, "grad_norm": 0.44672852754592896, "learning_rate": 3.8581465310828566e-05, "loss": 0.0269, "num_input_tokens_seen": 15824544, "step": 4755 }, { "epoch": 0.9524047720281119, "grad_norm": 0.23382925987243652, "learning_rate": 3.8559464892185526e-05, "loss": 0.0181, "num_input_tokens_seen": 15840672, "step": 4760 }, { "epoch": 0.9534051972088138, "grad_norm": 0.20152555406093597, "learning_rate": 3.853744958606708e-05, "loss": 0.0354, "num_input_tokens_seen": 15857824, "step": 4765 }, { "epoch": 0.9544056223895155, "grad_norm": 0.15209810435771942, "learning_rate": 3.851541941664471e-05, "loss": 0.0288, "num_input_tokens_seen": 15873472, "step": 4770 }, { "epoch": 0.9554060475702173, "grad_norm": 0.9689774513244629, "learning_rate": 3.849337440810622e-05, "loss": 0.0287, "num_input_tokens_seen": 15889248, "step": 4775 }, { "epoch": 0.9564064727509192, "grad_norm": 0.7377488017082214, "learning_rate": 3.847131458465568e-05, "loss": 0.0498, "num_input_tokens_seen": 15907136, "step": 4780 }, { "epoch": 0.9574068979316209, "grad_norm": 0.4979168772697449, "learning_rate": 3.8449239970513476e-05, "loss": 0.0604, "num_input_tokens_seen": 15923456, "step": 4785 }, { "epoch": 0.9584073231123227, "grad_norm": 0.21843913197517395, "learning_rate": 3.842715058991619e-05, "loss": 0.0398, "num_input_tokens_seen": 15938880, "step": 4790 }, { "epoch": 0.9594077482930246, "grad_norm": 0.2384534627199173, "learning_rate": 3.8405046467116636e-05, "loss": 0.0377, "num_input_tokens_seen": 15956320, "step": 4795 }, { "epoch": 0.9604081734737263, "grad_norm": 0.26655861735343933, "learning_rate": 3.8382927626383794e-05, "loss": 0.0161, "num_input_tokens_seen": 15973088, "step": 4800 }, { "epoch": 0.9614085986544282, "grad_norm": 0.07932254672050476, "learning_rate": 3.836079409200283e-05, "loss": 0.0228, "num_input_tokens_seen": 15989696, "step": 4805 }, { "epoch": 0.9624090238351299, "grad_norm": 0.4905978739261627, "learning_rate": 3.833864588827504e-05, "loss": 0.0198, "num_input_tokens_seen": 16006784, "step": 4810 }, { "epoch": 0.9634094490158317, "grad_norm": 0.2714848220348358, "learning_rate": 3.83164830395178e-05, "loss": 0.0299, "num_input_tokens_seen": 16022688, "step": 4815 }, { "epoch": 0.9644098741965336, "grad_norm": 0.7874173521995544, "learning_rate": 3.8294305570064596e-05, "loss": 0.0438, "num_input_tokens_seen": 16040064, "step": 4820 }, { "epoch": 0.9654102993772353, "grad_norm": 0.5085760354995728, "learning_rate": 3.8272113504264936e-05, "loss": 0.0271, "num_input_tokens_seen": 16055456, "step": 4825 }, { "epoch": 0.9664107245579371, "grad_norm": 0.5401554107666016, "learning_rate": 3.8249906866484386e-05, "loss": 0.0452, "num_input_tokens_seen": 16072352, "step": 4830 }, { "epoch": 0.967411149738639, "grad_norm": 0.45530328154563904, "learning_rate": 3.8227685681104484e-05, "loss": 0.0362, "num_input_tokens_seen": 16088928, "step": 4835 }, { "epoch": 0.9684115749193407, "grad_norm": 0.1726125329732895, "learning_rate": 3.8205449972522775e-05, "loss": 0.0349, "num_input_tokens_seen": 16105440, "step": 4840 }, { "epoch": 0.9694120001000425, "grad_norm": 0.5761016011238098, "learning_rate": 3.8183199765152706e-05, "loss": 0.021, "num_input_tokens_seen": 16121344, "step": 4845 }, { "epoch": 0.9704124252807443, "grad_norm": 0.6431677341461182, "learning_rate": 3.816093508342366e-05, "loss": 0.0484, "num_input_tokens_seen": 16137152, "step": 4850 }, { "epoch": 0.9714128504614461, "grad_norm": 0.3383553624153137, "learning_rate": 3.813865595178094e-05, "loss": 0.0314, "num_input_tokens_seen": 16153728, "step": 4855 }, { "epoch": 0.972413275642148, "grad_norm": 0.38180163502693176, "learning_rate": 3.811636239468568e-05, "loss": 0.0369, "num_input_tokens_seen": 16171584, "step": 4860 }, { "epoch": 0.9734137008228497, "grad_norm": 0.22540175914764404, "learning_rate": 3.8094054436614855e-05, "loss": 0.0188, "num_input_tokens_seen": 16188864, "step": 4865 }, { "epoch": 0.9744141260035515, "grad_norm": 0.22031109035015106, "learning_rate": 3.8071732102061254e-05, "loss": 0.0158, "num_input_tokens_seen": 16206048, "step": 4870 }, { "epoch": 0.9754145511842534, "grad_norm": 0.7438973784446716, "learning_rate": 3.8049395415533474e-05, "loss": 0.0556, "num_input_tokens_seen": 16222016, "step": 4875 }, { "epoch": 0.9764149763649551, "grad_norm": 0.841015100479126, "learning_rate": 3.802704440155583e-05, "loss": 0.0667, "num_input_tokens_seen": 16238272, "step": 4880 }, { "epoch": 0.9774154015456569, "grad_norm": 0.34464555978775024, "learning_rate": 3.800467908466841e-05, "loss": 0.0339, "num_input_tokens_seen": 16254304, "step": 4885 }, { "epoch": 0.9784158267263587, "grad_norm": 0.39327627420425415, "learning_rate": 3.7982299489426975e-05, "loss": 0.0432, "num_input_tokens_seen": 16270944, "step": 4890 }, { "epoch": 0.9794162519070605, "grad_norm": 0.5456660389900208, "learning_rate": 3.795990564040296e-05, "loss": 0.0416, "num_input_tokens_seen": 16287744, "step": 4895 }, { "epoch": 0.9804166770877623, "grad_norm": 0.5775066018104553, "learning_rate": 3.793749756218349e-05, "loss": 0.0418, "num_input_tokens_seen": 16303680, "step": 4900 }, { "epoch": 0.9814171022684641, "grad_norm": 0.5876214504241943, "learning_rate": 3.791507527937125e-05, "loss": 0.0546, "num_input_tokens_seen": 16321152, "step": 4905 }, { "epoch": 0.9824175274491659, "grad_norm": 0.42364928126335144, "learning_rate": 3.789263881658458e-05, "loss": 0.0354, "num_input_tokens_seen": 16338368, "step": 4910 }, { "epoch": 0.9834179526298676, "grad_norm": 1.0304877758026123, "learning_rate": 3.7870188198457356e-05, "loss": 0.0555, "num_input_tokens_seen": 16355712, "step": 4915 }, { "epoch": 0.9844183778105695, "grad_norm": 0.11261558532714844, "learning_rate": 3.784772344963901e-05, "loss": 0.0547, "num_input_tokens_seen": 16372480, "step": 4920 }, { "epoch": 0.9854188029912713, "grad_norm": 0.1952620893716812, "learning_rate": 3.782524459479446e-05, "loss": 0.023, "num_input_tokens_seen": 16389088, "step": 4925 }, { "epoch": 0.9864192281719731, "grad_norm": 0.42790642380714417, "learning_rate": 3.780275165860416e-05, "loss": 0.0449, "num_input_tokens_seen": 16405536, "step": 4930 }, { "epoch": 0.9874196533526749, "grad_norm": 0.08422848582267761, "learning_rate": 3.7780244665763995e-05, "loss": 0.0255, "num_input_tokens_seen": 16421504, "step": 4935 }, { "epoch": 0.9884200785333767, "grad_norm": 0.544394314289093, "learning_rate": 3.775772364098529e-05, "loss": 0.0247, "num_input_tokens_seen": 16437952, "step": 4940 }, { "epoch": 0.9894205037140785, "grad_norm": 0.4700765609741211, "learning_rate": 3.7735188608994745e-05, "loss": 0.0263, "num_input_tokens_seen": 16455168, "step": 4945 }, { "epoch": 0.9904209288947803, "grad_norm": 0.3246154189109802, "learning_rate": 3.77126395945345e-05, "loss": 0.0246, "num_input_tokens_seen": 16472448, "step": 4950 }, { "epoch": 0.991421354075482, "grad_norm": 0.35189178586006165, "learning_rate": 3.7690076622362006e-05, "loss": 0.0154, "num_input_tokens_seen": 16488608, "step": 4955 }, { "epoch": 0.9924217792561839, "grad_norm": 0.09933869540691376, "learning_rate": 3.7667499717250026e-05, "loss": 0.0294, "num_input_tokens_seen": 16504640, "step": 4960 }, { "epoch": 0.9934222044368857, "grad_norm": 0.8066585659980774, "learning_rate": 3.764490890398667e-05, "loss": 0.0378, "num_input_tokens_seen": 16521760, "step": 4965 }, { "epoch": 0.9944226296175874, "grad_norm": 0.07991819828748703, "learning_rate": 3.762230420737528e-05, "loss": 0.0271, "num_input_tokens_seen": 16537888, "step": 4970 }, { "epoch": 0.9954230547982893, "grad_norm": 0.7777450680732727, "learning_rate": 3.759968565223444e-05, "loss": 0.0187, "num_input_tokens_seen": 16555168, "step": 4975 }, { "epoch": 0.9964234799789911, "grad_norm": 0.35846781730651855, "learning_rate": 3.7577053263397974e-05, "loss": 0.0281, "num_input_tokens_seen": 16573280, "step": 4980 }, { "epoch": 0.9974239051596928, "grad_norm": 0.32245972752571106, "learning_rate": 3.755440706571487e-05, "loss": 0.0182, "num_input_tokens_seen": 16590048, "step": 4985 }, { "epoch": 0.9984243303403947, "grad_norm": 0.7595393657684326, "learning_rate": 3.7531747084049304e-05, "loss": 0.029, "num_input_tokens_seen": 16606080, "step": 4990 }, { "epoch": 0.9994247555210964, "grad_norm": 0.27887552976608276, "learning_rate": 3.7509073343280555e-05, "loss": 0.0375, "num_input_tokens_seen": 16622848, "step": 4995 }, { "epoch": 1.0004251807017983, "grad_norm": 0.27957552671432495, "learning_rate": 3.748638586830303e-05, "loss": 0.0198, "num_input_tokens_seen": 16639872, "step": 5000 }, { "epoch": 1.0014256058825, "grad_norm": 0.5249574184417725, "learning_rate": 3.7463684684026215e-05, "loss": 0.0254, "num_input_tokens_seen": 16656160, "step": 5005 }, { "epoch": 1.0024260310632018, "grad_norm": 0.2980789542198181, "learning_rate": 3.744096981537463e-05, "loss": 0.0235, "num_input_tokens_seen": 16671392, "step": 5010 }, { "epoch": 1.0034264562439037, "grad_norm": 0.18490222096443176, "learning_rate": 3.741824128728784e-05, "loss": 0.0208, "num_input_tokens_seen": 16688672, "step": 5015 }, { "epoch": 1.0044268814246056, "grad_norm": 0.2901025414466858, "learning_rate": 3.73954991247204e-05, "loss": 0.0164, "num_input_tokens_seen": 16704704, "step": 5020 }, { "epoch": 1.0054273066053072, "grad_norm": 0.8393234610557556, "learning_rate": 3.737274335264184e-05, "loss": 0.0342, "num_input_tokens_seen": 16721664, "step": 5025 }, { "epoch": 1.006427731786009, "grad_norm": 0.24501626193523407, "learning_rate": 3.7349973996036606e-05, "loss": 0.0213, "num_input_tokens_seen": 16739200, "step": 5030 }, { "epoch": 1.007428156966711, "grad_norm": 0.625282347202301, "learning_rate": 3.7327191079904094e-05, "loss": 0.0142, "num_input_tokens_seen": 16755488, "step": 5035 }, { "epoch": 1.0084285821474126, "grad_norm": 0.390674352645874, "learning_rate": 3.730439462925858e-05, "loss": 0.0377, "num_input_tokens_seen": 16771712, "step": 5040 }, { "epoch": 1.0094290073281145, "grad_norm": 0.1857205480337143, "learning_rate": 3.728158466912918e-05, "loss": 0.0187, "num_input_tokens_seen": 16788256, "step": 5045 }, { "epoch": 1.0104294325088163, "grad_norm": 0.08821652084589005, "learning_rate": 3.7258761224559855e-05, "loss": 0.027, "num_input_tokens_seen": 16805056, "step": 5050 }, { "epoch": 1.011429857689518, "grad_norm": 0.3787899613380432, "learning_rate": 3.72359243206094e-05, "loss": 0.044, "num_input_tokens_seen": 16822848, "step": 5055 }, { "epoch": 1.0124302828702199, "grad_norm": 0.2202107459306717, "learning_rate": 3.721307398235131e-05, "loss": 0.0186, "num_input_tokens_seen": 16839360, "step": 5060 }, { "epoch": 1.0134307080509217, "grad_norm": 0.793282687664032, "learning_rate": 3.7190210234873924e-05, "loss": 0.0421, "num_input_tokens_seen": 16856896, "step": 5065 }, { "epoch": 1.0144311332316234, "grad_norm": 0.602552056312561, "learning_rate": 3.7167333103280234e-05, "loss": 0.0301, "num_input_tokens_seen": 16873568, "step": 5070 }, { "epoch": 1.0154315584123252, "grad_norm": 0.3753560185432434, "learning_rate": 3.714444261268796e-05, "loss": 0.0124, "num_input_tokens_seen": 16890880, "step": 5075 }, { "epoch": 1.016431983593027, "grad_norm": 0.4732748866081238, "learning_rate": 3.7121538788229485e-05, "loss": 0.0163, "num_input_tokens_seen": 16906816, "step": 5080 }, { "epoch": 1.0174324087737288, "grad_norm": 0.6718013286590576, "learning_rate": 3.709862165505182e-05, "loss": 0.0284, "num_input_tokens_seen": 16922368, "step": 5085 }, { "epoch": 1.0184328339544306, "grad_norm": 0.340224027633667, "learning_rate": 3.707569123831659e-05, "loss": 0.0232, "num_input_tokens_seen": 16939200, "step": 5090 }, { "epoch": 1.0194332591351325, "grad_norm": 0.8260764479637146, "learning_rate": 3.705274756320004e-05, "loss": 0.0347, "num_input_tokens_seen": 16956608, "step": 5095 }, { "epoch": 1.0204336843158341, "grad_norm": 0.41199642419815063, "learning_rate": 3.7029790654892924e-05, "loss": 0.0319, "num_input_tokens_seen": 16974112, "step": 5100 }, { "epoch": 1.021434109496536, "grad_norm": 0.15774880349636078, "learning_rate": 3.700682053860053e-05, "loss": 0.0108, "num_input_tokens_seen": 16990592, "step": 5105 }, { "epoch": 1.0224345346772379, "grad_norm": 0.8791263103485107, "learning_rate": 3.698383723954268e-05, "loss": 0.0244, "num_input_tokens_seen": 17005824, "step": 5110 }, { "epoch": 1.0234349598579395, "grad_norm": 0.4210505783557892, "learning_rate": 3.6960840782953654e-05, "loss": 0.0302, "num_input_tokens_seen": 17021824, "step": 5115 }, { "epoch": 1.0244353850386414, "grad_norm": 0.305221289396286, "learning_rate": 3.6937831194082166e-05, "loss": 0.0214, "num_input_tokens_seen": 17038208, "step": 5120 }, { "epoch": 1.0254358102193433, "grad_norm": 0.25967296957969666, "learning_rate": 3.6914808498191355e-05, "loss": 0.0197, "num_input_tokens_seen": 17056096, "step": 5125 }, { "epoch": 1.026436235400045, "grad_norm": 0.5764554142951965, "learning_rate": 3.689177272055877e-05, "loss": 0.0222, "num_input_tokens_seen": 17072960, "step": 5130 }, { "epoch": 1.0274366605807468, "grad_norm": 0.4865210950374603, "learning_rate": 3.6868723886476295e-05, "loss": 0.0232, "num_input_tokens_seen": 17090176, "step": 5135 }, { "epoch": 1.0284370857614487, "grad_norm": 0.23601755499839783, "learning_rate": 3.684566202125015e-05, "loss": 0.0203, "num_input_tokens_seen": 17106528, "step": 5140 }, { "epoch": 1.0294375109421505, "grad_norm": 0.6067863702774048, "learning_rate": 3.6822587150200897e-05, "loss": 0.0243, "num_input_tokens_seen": 17123008, "step": 5145 }, { "epoch": 1.0304379361228522, "grad_norm": 0.5614882111549377, "learning_rate": 3.679949929866334e-05, "loss": 0.0153, "num_input_tokens_seen": 17139200, "step": 5150 }, { "epoch": 1.031438361303554, "grad_norm": 0.16848532855510712, "learning_rate": 3.6776398491986555e-05, "loss": 0.033, "num_input_tokens_seen": 17155648, "step": 5155 }, { "epoch": 1.032438786484256, "grad_norm": 0.20804527401924133, "learning_rate": 3.675328475553382e-05, "loss": 0.0294, "num_input_tokens_seen": 17172352, "step": 5160 }, { "epoch": 1.0334392116649576, "grad_norm": 0.3758269250392914, "learning_rate": 3.6730158114682645e-05, "loss": 0.0119, "num_input_tokens_seen": 17188640, "step": 5165 }, { "epoch": 1.0344396368456594, "grad_norm": 0.06102520599961281, "learning_rate": 3.670701859482468e-05, "loss": 0.039, "num_input_tokens_seen": 17204128, "step": 5170 }, { "epoch": 1.0354400620263613, "grad_norm": 0.3474910855293274, "learning_rate": 3.668386622136573e-05, "loss": 0.0167, "num_input_tokens_seen": 17219232, "step": 5175 }, { "epoch": 1.036440487207063, "grad_norm": 0.6429199576377869, "learning_rate": 3.666070101972569e-05, "loss": 0.0167, "num_input_tokens_seen": 17235328, "step": 5180 }, { "epoch": 1.0374409123877648, "grad_norm": 0.5511621236801147, "learning_rate": 3.663752301533857e-05, "loss": 0.0281, "num_input_tokens_seen": 17252128, "step": 5185 }, { "epoch": 1.0384413375684667, "grad_norm": 0.99336177110672, "learning_rate": 3.661433223365241e-05, "loss": 0.0424, "num_input_tokens_seen": 17268224, "step": 5190 }, { "epoch": 1.0394417627491683, "grad_norm": 0.26638805866241455, "learning_rate": 3.659112870012932e-05, "loss": 0.0117, "num_input_tokens_seen": 17284960, "step": 5195 }, { "epoch": 1.0404421879298702, "grad_norm": 0.3554804027080536, "learning_rate": 3.656791244024534e-05, "loss": 0.0205, "num_input_tokens_seen": 17302272, "step": 5200 }, { "epoch": 1.041442613110572, "grad_norm": 1.1736732721328735, "learning_rate": 3.654468347949055e-05, "loss": 0.0365, "num_input_tokens_seen": 17318976, "step": 5205 }, { "epoch": 1.0424430382912737, "grad_norm": 0.4835314154624939, "learning_rate": 3.6521441843368945e-05, "loss": 0.0291, "num_input_tokens_seen": 17335840, "step": 5210 }, { "epoch": 1.0434434634719756, "grad_norm": 0.3860490620136261, "learning_rate": 3.649818755739844e-05, "loss": 0.0162, "num_input_tokens_seen": 17352704, "step": 5215 }, { "epoch": 1.0444438886526775, "grad_norm": 0.6804181337356567, "learning_rate": 3.647492064711083e-05, "loss": 0.0413, "num_input_tokens_seen": 17369472, "step": 5220 }, { "epoch": 1.045444313833379, "grad_norm": 0.349334716796875, "learning_rate": 3.645164113805181e-05, "loss": 0.025, "num_input_tokens_seen": 17387008, "step": 5225 }, { "epoch": 1.046444739014081, "grad_norm": 0.3037664592266083, "learning_rate": 3.642834905578084e-05, "loss": 0.0212, "num_input_tokens_seen": 17403552, "step": 5230 }, { "epoch": 1.0474451641947828, "grad_norm": 0.2728258967399597, "learning_rate": 3.640504442587125e-05, "loss": 0.012, "num_input_tokens_seen": 17421408, "step": 5235 }, { "epoch": 1.0484455893754845, "grad_norm": 0.8211174011230469, "learning_rate": 3.6381727273910104e-05, "loss": 0.0286, "num_input_tokens_seen": 17438240, "step": 5240 }, { "epoch": 1.0494460145561864, "grad_norm": 0.2891646921634674, "learning_rate": 3.635839762549824e-05, "loss": 0.027, "num_input_tokens_seen": 17455488, "step": 5245 }, { "epoch": 1.0504464397368882, "grad_norm": 0.75777268409729, "learning_rate": 3.633505550625021e-05, "loss": 0.0235, "num_input_tokens_seen": 17471872, "step": 5250 }, { "epoch": 1.0514468649175899, "grad_norm": 0.709787130355835, "learning_rate": 3.6311700941794236e-05, "loss": 0.0277, "num_input_tokens_seen": 17489024, "step": 5255 }, { "epoch": 1.0524472900982917, "grad_norm": 0.4019218385219574, "learning_rate": 3.628833395777224e-05, "loss": 0.0191, "num_input_tokens_seen": 17506912, "step": 5260 }, { "epoch": 1.0534477152789936, "grad_norm": 0.47778013348579407, "learning_rate": 3.626495457983975e-05, "loss": 0.0211, "num_input_tokens_seen": 17524320, "step": 5265 }, { "epoch": 1.0544481404596953, "grad_norm": 0.5299893617630005, "learning_rate": 3.624156283366592e-05, "loss": 0.0343, "num_input_tokens_seen": 17539904, "step": 5270 }, { "epoch": 1.0554485656403971, "grad_norm": 0.16689853370189667, "learning_rate": 3.6218158744933476e-05, "loss": 0.0061, "num_input_tokens_seen": 17557504, "step": 5275 }, { "epoch": 1.056448990821099, "grad_norm": 0.5492450594902039, "learning_rate": 3.619474233933871e-05, "loss": 0.0346, "num_input_tokens_seen": 17574112, "step": 5280 }, { "epoch": 1.0574494160018009, "grad_norm": 0.7357029318809509, "learning_rate": 3.617131364259141e-05, "loss": 0.0196, "num_input_tokens_seen": 17590176, "step": 5285 }, { "epoch": 1.0584498411825025, "grad_norm": 0.4019847810268402, "learning_rate": 3.614787268041487e-05, "loss": 0.0129, "num_input_tokens_seen": 17606400, "step": 5290 }, { "epoch": 1.0594502663632044, "grad_norm": 0.6422977447509766, "learning_rate": 3.6124419478545854e-05, "loss": 0.0256, "num_input_tokens_seen": 17623584, "step": 5295 }, { "epoch": 1.0604506915439063, "grad_norm": 0.3176085948944092, "learning_rate": 3.610095406273459e-05, "loss": 0.0216, "num_input_tokens_seen": 17641504, "step": 5300 }, { "epoch": 1.061451116724608, "grad_norm": 0.6598035097122192, "learning_rate": 3.6077476458744664e-05, "loss": 0.0237, "num_input_tokens_seen": 17659232, "step": 5305 }, { "epoch": 1.0624515419053098, "grad_norm": 0.17913129925727844, "learning_rate": 3.605398669235307e-05, "loss": 0.023, "num_input_tokens_seen": 17675744, "step": 5310 }, { "epoch": 1.0634519670860116, "grad_norm": 0.6988956928253174, "learning_rate": 3.6030484789350176e-05, "loss": 0.0502, "num_input_tokens_seen": 17693056, "step": 5315 }, { "epoch": 1.0644523922667133, "grad_norm": 0.41953983902931213, "learning_rate": 3.600697077553964e-05, "loss": 0.0187, "num_input_tokens_seen": 17710336, "step": 5320 }, { "epoch": 1.0654528174474152, "grad_norm": 0.4209190309047699, "learning_rate": 3.598344467673843e-05, "loss": 0.0198, "num_input_tokens_seen": 17726176, "step": 5325 }, { "epoch": 1.066453242628117, "grad_norm": 0.17477113008499146, "learning_rate": 3.595990651877679e-05, "loss": 0.0203, "num_input_tokens_seen": 17741664, "step": 5330 }, { "epoch": 1.0674536678088187, "grad_norm": 0.30309292674064636, "learning_rate": 3.593635632749821e-05, "loss": 0.0236, "num_input_tokens_seen": 17757888, "step": 5335 }, { "epoch": 1.0684540929895205, "grad_norm": 0.3024982810020447, "learning_rate": 3.591279412875937e-05, "loss": 0.0229, "num_input_tokens_seen": 17775584, "step": 5340 }, { "epoch": 1.0694545181702224, "grad_norm": 0.7211211323738098, "learning_rate": 3.588921994843013e-05, "loss": 0.0188, "num_input_tokens_seen": 17791040, "step": 5345 }, { "epoch": 1.070454943350924, "grad_norm": 0.4666048586368561, "learning_rate": 3.586563381239355e-05, "loss": 0.0329, "num_input_tokens_seen": 17807264, "step": 5350 }, { "epoch": 1.071455368531626, "grad_norm": 0.24549226462841034, "learning_rate": 3.5842035746545765e-05, "loss": 0.0155, "num_input_tokens_seen": 17824096, "step": 5355 }, { "epoch": 1.0724557937123278, "grad_norm": 0.3099384009838104, "learning_rate": 3.5818425776796044e-05, "loss": 0.016, "num_input_tokens_seen": 17840480, "step": 5360 }, { "epoch": 1.0734562188930294, "grad_norm": 0.2598489224910736, "learning_rate": 3.579480392906669e-05, "loss": 0.0387, "num_input_tokens_seen": 17857504, "step": 5365 }, { "epoch": 1.0744566440737313, "grad_norm": 1.099205732345581, "learning_rate": 3.5771170229293085e-05, "loss": 0.0285, "num_input_tokens_seen": 17873600, "step": 5370 }, { "epoch": 1.0754570692544332, "grad_norm": 0.8734391331672668, "learning_rate": 3.574752470342361e-05, "loss": 0.0526, "num_input_tokens_seen": 17890400, "step": 5375 }, { "epoch": 1.0764574944351348, "grad_norm": 0.34526267647743225, "learning_rate": 3.5723867377419614e-05, "loss": 0.0279, "num_input_tokens_seen": 17907104, "step": 5380 }, { "epoch": 1.0774579196158367, "grad_norm": 0.3123202621936798, "learning_rate": 3.570019827725542e-05, "loss": 0.0233, "num_input_tokens_seen": 17923200, "step": 5385 }, { "epoch": 1.0784583447965386, "grad_norm": 0.6335588693618774, "learning_rate": 3.567651742891828e-05, "loss": 0.0212, "num_input_tokens_seen": 17940448, "step": 5390 }, { "epoch": 1.0794587699772404, "grad_norm": 0.10668666660785675, "learning_rate": 3.5652824858408326e-05, "loss": 0.0146, "num_input_tokens_seen": 17956544, "step": 5395 }, { "epoch": 1.080459195157942, "grad_norm": 0.7952919602394104, "learning_rate": 3.562912059173859e-05, "loss": 0.0317, "num_input_tokens_seen": 17973312, "step": 5400 }, { "epoch": 1.081459620338644, "grad_norm": 0.8085919618606567, "learning_rate": 3.560540465493492e-05, "loss": 0.0285, "num_input_tokens_seen": 17989376, "step": 5405 }, { "epoch": 1.0824600455193458, "grad_norm": 0.3211195468902588, "learning_rate": 3.558167707403599e-05, "loss": 0.0277, "num_input_tokens_seen": 18005536, "step": 5410 }, { "epoch": 1.0834604707000475, "grad_norm": 0.7521841526031494, "learning_rate": 3.555793787509324e-05, "loss": 0.0244, "num_input_tokens_seen": 18021888, "step": 5415 }, { "epoch": 1.0844608958807493, "grad_norm": 0.7724508047103882, "learning_rate": 3.5534187084170884e-05, "loss": 0.0276, "num_input_tokens_seen": 18037664, "step": 5420 }, { "epoch": 1.0854613210614512, "grad_norm": 0.40795546770095825, "learning_rate": 3.5510424727345874e-05, "loss": 0.0206, "num_input_tokens_seen": 18054400, "step": 5425 }, { "epoch": 1.0864617462421529, "grad_norm": 0.40630871057510376, "learning_rate": 3.548665083070784e-05, "loss": 0.0218, "num_input_tokens_seen": 18072672, "step": 5430 }, { "epoch": 1.0874621714228547, "grad_norm": 0.7899190783500671, "learning_rate": 3.5462865420359073e-05, "loss": 0.0295, "num_input_tokens_seen": 18089280, "step": 5435 }, { "epoch": 1.0884625966035566, "grad_norm": 0.5796297788619995, "learning_rate": 3.543906852241454e-05, "loss": 0.0321, "num_input_tokens_seen": 18105600, "step": 5440 }, { "epoch": 1.0894630217842582, "grad_norm": 0.7630406022071838, "learning_rate": 3.54152601630018e-05, "loss": 0.0297, "num_input_tokens_seen": 18121152, "step": 5445 }, { "epoch": 1.0904634469649601, "grad_norm": 0.6281930208206177, "learning_rate": 3.5391440368260975e-05, "loss": 0.0432, "num_input_tokens_seen": 18138976, "step": 5450 }, { "epoch": 1.091463872145662, "grad_norm": 0.6421923041343689, "learning_rate": 3.536760916434478e-05, "loss": 0.0412, "num_input_tokens_seen": 18156800, "step": 5455 }, { "epoch": 1.0924642973263636, "grad_norm": 0.4233951270580292, "learning_rate": 3.534376657741845e-05, "loss": 0.0372, "num_input_tokens_seen": 18172512, "step": 5460 }, { "epoch": 1.0934647225070655, "grad_norm": 0.30023810267448425, "learning_rate": 3.53199126336597e-05, "loss": 0.0178, "num_input_tokens_seen": 18188352, "step": 5465 }, { "epoch": 1.0944651476877674, "grad_norm": 0.8775084614753723, "learning_rate": 3.529604735925871e-05, "loss": 0.0278, "num_input_tokens_seen": 18204960, "step": 5470 }, { "epoch": 1.095465572868469, "grad_norm": 0.2205784022808075, "learning_rate": 3.527217078041813e-05, "loss": 0.0205, "num_input_tokens_seen": 18220000, "step": 5475 }, { "epoch": 1.0964659980491709, "grad_norm": 0.7994363307952881, "learning_rate": 3.5248282923353004e-05, "loss": 0.032, "num_input_tokens_seen": 18235712, "step": 5480 }, { "epoch": 1.0974664232298728, "grad_norm": 1.386659026145935, "learning_rate": 3.5224383814290754e-05, "loss": 0.0451, "num_input_tokens_seen": 18252384, "step": 5485 }, { "epoch": 1.0984668484105744, "grad_norm": 0.6417826414108276, "learning_rate": 3.520047347947116e-05, "loss": 0.037, "num_input_tokens_seen": 18268256, "step": 5490 }, { "epoch": 1.0994672735912763, "grad_norm": 0.3438206613063812, "learning_rate": 3.517655194514634e-05, "loss": 0.0224, "num_input_tokens_seen": 18284128, "step": 5495 }, { "epoch": 1.1004676987719781, "grad_norm": 0.7664891481399536, "learning_rate": 3.515261923758068e-05, "loss": 0.0237, "num_input_tokens_seen": 18300480, "step": 5500 }, { "epoch": 1.1014681239526798, "grad_norm": 0.6775107383728027, "learning_rate": 3.512867538305086e-05, "loss": 0.0164, "num_input_tokens_seen": 18318144, "step": 5505 }, { "epoch": 1.1024685491333817, "grad_norm": 0.3344127833843231, "learning_rate": 3.510472040784579e-05, "loss": 0.0171, "num_input_tokens_seen": 18333760, "step": 5510 }, { "epoch": 1.1034689743140835, "grad_norm": 0.2144256830215454, "learning_rate": 3.50807543382666e-05, "loss": 0.0102, "num_input_tokens_seen": 18349056, "step": 5515 }, { "epoch": 1.1044693994947852, "grad_norm": 0.3108498156070709, "learning_rate": 3.5056777200626574e-05, "loss": 0.018, "num_input_tokens_seen": 18366240, "step": 5520 }, { "epoch": 1.105469824675487, "grad_norm": 0.23828475177288055, "learning_rate": 3.5032789021251167e-05, "loss": 0.0204, "num_input_tokens_seen": 18382816, "step": 5525 }, { "epoch": 1.106470249856189, "grad_norm": 0.316902220249176, "learning_rate": 3.500878982647795e-05, "loss": 0.0144, "num_input_tokens_seen": 18399808, "step": 5530 }, { "epoch": 1.1074706750368906, "grad_norm": 1.227034330368042, "learning_rate": 3.498477964265661e-05, "loss": 0.0239, "num_input_tokens_seen": 18417088, "step": 5535 }, { "epoch": 1.1084711002175924, "grad_norm": 1.0346033573150635, "learning_rate": 3.496075849614887e-05, "loss": 0.0248, "num_input_tokens_seen": 18432448, "step": 5540 }, { "epoch": 1.1094715253982943, "grad_norm": 0.13602112233638763, "learning_rate": 3.49367264133285e-05, "loss": 0.0256, "num_input_tokens_seen": 18449440, "step": 5545 }, { "epoch": 1.1104719505789962, "grad_norm": 0.771540105342865, "learning_rate": 3.491268342058128e-05, "loss": 0.0304, "num_input_tokens_seen": 18465312, "step": 5550 }, { "epoch": 1.1114723757596978, "grad_norm": 0.5797554850578308, "learning_rate": 3.488862954430499e-05, "loss": 0.0218, "num_input_tokens_seen": 18481824, "step": 5555 }, { "epoch": 1.1124728009403997, "grad_norm": 0.7463229298591614, "learning_rate": 3.4864564810909296e-05, "loss": 0.0164, "num_input_tokens_seen": 18498624, "step": 5560 }, { "epoch": 1.1134732261211016, "grad_norm": 0.5763323903083801, "learning_rate": 3.484048924681584e-05, "loss": 0.0195, "num_input_tokens_seen": 18513824, "step": 5565 }, { "epoch": 1.1144736513018032, "grad_norm": 0.17134538292884827, "learning_rate": 3.481640287845817e-05, "loss": 0.0281, "num_input_tokens_seen": 18530592, "step": 5570 }, { "epoch": 1.115474076482505, "grad_norm": 0.802047610282898, "learning_rate": 3.479230573228162e-05, "loss": 0.0349, "num_input_tokens_seen": 18547104, "step": 5575 }, { "epoch": 1.116474501663207, "grad_norm": 0.6504844427108765, "learning_rate": 3.476819783474344e-05, "loss": 0.0289, "num_input_tokens_seen": 18562560, "step": 5580 }, { "epoch": 1.1174749268439086, "grad_norm": 0.61082923412323, "learning_rate": 3.474407921231263e-05, "loss": 0.0294, "num_input_tokens_seen": 18578752, "step": 5585 }, { "epoch": 1.1184753520246105, "grad_norm": 0.09099405258893967, "learning_rate": 3.471994989147e-05, "loss": 0.0114, "num_input_tokens_seen": 18596128, "step": 5590 }, { "epoch": 1.1194757772053123, "grad_norm": 0.28908923268318176, "learning_rate": 3.4695809898708075e-05, "loss": 0.0127, "num_input_tokens_seen": 18611872, "step": 5595 }, { "epoch": 1.120476202386014, "grad_norm": 0.3132735788822174, "learning_rate": 3.4671659260531116e-05, "loss": 0.0168, "num_input_tokens_seen": 18628416, "step": 5600 }, { "epoch": 1.1214766275667158, "grad_norm": 0.7768466472625732, "learning_rate": 3.464749800345507e-05, "loss": 0.0145, "num_input_tokens_seen": 18645376, "step": 5605 }, { "epoch": 1.1224770527474177, "grad_norm": 0.35140541195869446, "learning_rate": 3.462332615400755e-05, "loss": 0.0288, "num_input_tokens_seen": 18662048, "step": 5610 }, { "epoch": 1.1234774779281194, "grad_norm": 0.6198104023933411, "learning_rate": 3.459914373872778e-05, "loss": 0.0236, "num_input_tokens_seen": 18678432, "step": 5615 }, { "epoch": 1.1244779031088212, "grad_norm": 0.14505791664123535, "learning_rate": 3.457495078416659e-05, "loss": 0.0202, "num_input_tokens_seen": 18694304, "step": 5620 }, { "epoch": 1.125478328289523, "grad_norm": 0.19856490194797516, "learning_rate": 3.45507473168864e-05, "loss": 0.0171, "num_input_tokens_seen": 18710080, "step": 5625 }, { "epoch": 1.126478753470225, "grad_norm": 0.11967971920967102, "learning_rate": 3.452653336346115e-05, "loss": 0.0109, "num_input_tokens_seen": 18726720, "step": 5630 }, { "epoch": 1.1274791786509266, "grad_norm": 0.2559490203857422, "learning_rate": 3.450230895047631e-05, "loss": 0.0251, "num_input_tokens_seen": 18743680, "step": 5635 }, { "epoch": 1.1284796038316285, "grad_norm": 0.4048309028148651, "learning_rate": 3.447807410452881e-05, "loss": 0.0364, "num_input_tokens_seen": 18760928, "step": 5640 }, { "epoch": 1.1294800290123304, "grad_norm": 0.05227162688970566, "learning_rate": 3.445382885222707e-05, "loss": 0.0132, "num_input_tokens_seen": 18777504, "step": 5645 }, { "epoch": 1.130480454193032, "grad_norm": 0.2708294093608856, "learning_rate": 3.44295732201909e-05, "loss": 0.0227, "num_input_tokens_seen": 18794752, "step": 5650 }, { "epoch": 1.1314808793737339, "grad_norm": 0.8836909532546997, "learning_rate": 3.4405307235051534e-05, "loss": 0.023, "num_input_tokens_seen": 18812352, "step": 5655 }, { "epoch": 1.1324813045544357, "grad_norm": 0.8740631937980652, "learning_rate": 3.438103092345155e-05, "loss": 0.0261, "num_input_tokens_seen": 18829472, "step": 5660 }, { "epoch": 1.1334817297351374, "grad_norm": 0.13569481670856476, "learning_rate": 3.435674431204489e-05, "loss": 0.0297, "num_input_tokens_seen": 18845664, "step": 5665 }, { "epoch": 1.1344821549158393, "grad_norm": 0.2504052519798279, "learning_rate": 3.4332447427496775e-05, "loss": 0.0278, "num_input_tokens_seen": 18861632, "step": 5670 }, { "epoch": 1.1354825800965411, "grad_norm": 0.8606333136558533, "learning_rate": 3.4308140296483724e-05, "loss": 0.0274, "num_input_tokens_seen": 18878240, "step": 5675 }, { "epoch": 1.1364830052772428, "grad_norm": 0.5849329233169556, "learning_rate": 3.4283822945693514e-05, "loss": 0.0286, "num_input_tokens_seen": 18894624, "step": 5680 }, { "epoch": 1.1374834304579446, "grad_norm": 0.30441370606422424, "learning_rate": 3.425949540182512e-05, "loss": 0.0366, "num_input_tokens_seen": 18911168, "step": 5685 }, { "epoch": 1.1384838556386465, "grad_norm": 0.29574379324913025, "learning_rate": 3.423515769158872e-05, "loss": 0.0133, "num_input_tokens_seen": 18927712, "step": 5690 }, { "epoch": 1.1394842808193482, "grad_norm": 0.2827134430408478, "learning_rate": 3.421080984170565e-05, "loss": 0.0249, "num_input_tokens_seen": 18944288, "step": 5695 }, { "epoch": 1.14048470600005, "grad_norm": 0.09477187693119049, "learning_rate": 3.418645187890839e-05, "loss": 0.0237, "num_input_tokens_seen": 18960864, "step": 5700 }, { "epoch": 1.141485131180752, "grad_norm": 0.2097742259502411, "learning_rate": 3.416208382994051e-05, "loss": 0.0155, "num_input_tokens_seen": 18977568, "step": 5705 }, { "epoch": 1.1424855563614535, "grad_norm": 0.887037456035614, "learning_rate": 3.413770572155667e-05, "loss": 0.0303, "num_input_tokens_seen": 18993920, "step": 5710 }, { "epoch": 1.1434859815421554, "grad_norm": 0.1299106776714325, "learning_rate": 3.411331758052254e-05, "loss": 0.0171, "num_input_tokens_seen": 19010688, "step": 5715 }, { "epoch": 1.1444864067228573, "grad_norm": 1.0206340551376343, "learning_rate": 3.408891943361486e-05, "loss": 0.0325, "num_input_tokens_seen": 19028992, "step": 5720 }, { "epoch": 1.145486831903559, "grad_norm": 0.44007593393325806, "learning_rate": 3.406451130762131e-05, "loss": 0.019, "num_input_tokens_seen": 19045792, "step": 5725 }, { "epoch": 1.1464872570842608, "grad_norm": 0.297586590051651, "learning_rate": 3.404009322934052e-05, "loss": 0.0136, "num_input_tokens_seen": 19062208, "step": 5730 }, { "epoch": 1.1474876822649627, "grad_norm": 0.34563490748405457, "learning_rate": 3.4015665225582104e-05, "loss": 0.0117, "num_input_tokens_seen": 19078304, "step": 5735 }, { "epoch": 1.1484881074456643, "grad_norm": 0.38076719641685486, "learning_rate": 3.399122732316653e-05, "loss": 0.0153, "num_input_tokens_seen": 19095904, "step": 5740 }, { "epoch": 1.1494885326263662, "grad_norm": 0.1036958321928978, "learning_rate": 3.396677954892514e-05, "loss": 0.0243, "num_input_tokens_seen": 19111264, "step": 5745 }, { "epoch": 1.150488957807068, "grad_norm": 0.0501471608877182, "learning_rate": 3.3942321929700096e-05, "loss": 0.0102, "num_input_tokens_seen": 19128064, "step": 5750 }, { "epoch": 1.1514893829877697, "grad_norm": 0.27944546937942505, "learning_rate": 3.3917854492344416e-05, "loss": 0.0181, "num_input_tokens_seen": 19143904, "step": 5755 }, { "epoch": 1.1524898081684716, "grad_norm": 0.653773307800293, "learning_rate": 3.389337726372187e-05, "loss": 0.0128, "num_input_tokens_seen": 19159328, "step": 5760 }, { "epoch": 1.1534902333491734, "grad_norm": 0.3465997874736786, "learning_rate": 3.386889027070696e-05, "loss": 0.0109, "num_input_tokens_seen": 19175488, "step": 5765 }, { "epoch": 1.154490658529875, "grad_norm": 0.22995132207870483, "learning_rate": 3.3844393540184954e-05, "loss": 0.0209, "num_input_tokens_seen": 19192416, "step": 5770 }, { "epoch": 1.155491083710577, "grad_norm": 1.1864521503448486, "learning_rate": 3.381988709905177e-05, "loss": 0.0479, "num_input_tokens_seen": 19209952, "step": 5775 }, { "epoch": 1.1564915088912788, "grad_norm": 0.29596784710884094, "learning_rate": 3.379537097421401e-05, "loss": 0.0179, "num_input_tokens_seen": 19226112, "step": 5780 }, { "epoch": 1.1574919340719805, "grad_norm": 0.20320992171764374, "learning_rate": 3.37708451925889e-05, "loss": 0.0267, "num_input_tokens_seen": 19242816, "step": 5785 }, { "epoch": 1.1584923592526823, "grad_norm": 0.21257950365543365, "learning_rate": 3.374630978110428e-05, "loss": 0.0265, "num_input_tokens_seen": 19259456, "step": 5790 }, { "epoch": 1.1594927844333842, "grad_norm": 0.32195860147476196, "learning_rate": 3.372176476669853e-05, "loss": 0.024, "num_input_tokens_seen": 19275488, "step": 5795 }, { "epoch": 1.1604932096140859, "grad_norm": 0.3061094880104065, "learning_rate": 3.3697210176320615e-05, "loss": 0.0162, "num_input_tokens_seen": 19291232, "step": 5800 }, { "epoch": 1.1614936347947877, "grad_norm": 0.5570156574249268, "learning_rate": 3.3672646036929985e-05, "loss": 0.024, "num_input_tokens_seen": 19308448, "step": 5805 }, { "epoch": 1.1624940599754896, "grad_norm": 0.05850246548652649, "learning_rate": 3.3648072375496594e-05, "loss": 0.024, "num_input_tokens_seen": 19324416, "step": 5810 }, { "epoch": 1.1634944851561915, "grad_norm": 0.42457568645477295, "learning_rate": 3.362348921900085e-05, "loss": 0.0221, "num_input_tokens_seen": 19342176, "step": 5815 }, { "epoch": 1.1644949103368931, "grad_norm": 0.6461185216903687, "learning_rate": 3.359889659443356e-05, "loss": 0.0171, "num_input_tokens_seen": 19359264, "step": 5820 }, { "epoch": 1.165495335517595, "grad_norm": 0.24088598787784576, "learning_rate": 3.3574294528795945e-05, "loss": 0.0225, "num_input_tokens_seen": 19375808, "step": 5825 }, { "epoch": 1.1664957606982969, "grad_norm": 0.24455271661281586, "learning_rate": 3.35496830490996e-05, "loss": 0.021, "num_input_tokens_seen": 19391296, "step": 5830 }, { "epoch": 1.1674961858789985, "grad_norm": 0.48601317405700684, "learning_rate": 3.352506218236644e-05, "loss": 0.0147, "num_input_tokens_seen": 19408896, "step": 5835 }, { "epoch": 1.1684966110597004, "grad_norm": 0.13113577663898468, "learning_rate": 3.35004319556287e-05, "loss": 0.0172, "num_input_tokens_seen": 19425856, "step": 5840 }, { "epoch": 1.1694970362404022, "grad_norm": 0.21660153567790985, "learning_rate": 3.347579239592888e-05, "loss": 0.022, "num_input_tokens_seen": 19443200, "step": 5845 }, { "epoch": 1.170497461421104, "grad_norm": 0.39085274934768677, "learning_rate": 3.3451143530319725e-05, "loss": 0.0165, "num_input_tokens_seen": 19460800, "step": 5850 }, { "epoch": 1.1714978866018058, "grad_norm": 1.0306813716888428, "learning_rate": 3.342648538586422e-05, "loss": 0.0167, "num_input_tokens_seen": 19477280, "step": 5855 }, { "epoch": 1.1724983117825076, "grad_norm": 0.38740503787994385, "learning_rate": 3.340181798963551e-05, "loss": 0.0252, "num_input_tokens_seen": 19495232, "step": 5860 }, { "epoch": 1.1734987369632093, "grad_norm": 0.10791005939245224, "learning_rate": 3.337714136871691e-05, "loss": 0.0362, "num_input_tokens_seen": 19511232, "step": 5865 }, { "epoch": 1.1744991621439111, "grad_norm": 0.49075236916542053, "learning_rate": 3.335245555020187e-05, "loss": 0.0241, "num_input_tokens_seen": 19528256, "step": 5870 }, { "epoch": 1.175499587324613, "grad_norm": 0.8124425411224365, "learning_rate": 3.332776056119392e-05, "loss": 0.0284, "num_input_tokens_seen": 19545344, "step": 5875 }, { "epoch": 1.1765000125053147, "grad_norm": 0.543897807598114, "learning_rate": 3.330305642880669e-05, "loss": 0.0191, "num_input_tokens_seen": 19561792, "step": 5880 }, { "epoch": 1.1775004376860165, "grad_norm": 1.1725205183029175, "learning_rate": 3.327834318016381e-05, "loss": 0.0338, "num_input_tokens_seen": 19578016, "step": 5885 }, { "epoch": 1.1785008628667184, "grad_norm": 0.5985966324806213, "learning_rate": 3.325362084239894e-05, "loss": 0.0297, "num_input_tokens_seen": 19595776, "step": 5890 }, { "epoch": 1.1795012880474203, "grad_norm": 0.3250117301940918, "learning_rate": 3.322888944265572e-05, "loss": 0.0451, "num_input_tokens_seen": 19612960, "step": 5895 }, { "epoch": 1.180501713228122, "grad_norm": 0.5441887378692627, "learning_rate": 3.3204149008087745e-05, "loss": 0.0363, "num_input_tokens_seen": 19629728, "step": 5900 }, { "epoch": 1.1815021384088238, "grad_norm": 0.22758100926876068, "learning_rate": 3.317939956585851e-05, "loss": 0.0274, "num_input_tokens_seen": 19647360, "step": 5905 }, { "epoch": 1.1825025635895257, "grad_norm": 0.4903544783592224, "learning_rate": 3.315464114314141e-05, "loss": 0.0262, "num_input_tokens_seen": 19664128, "step": 5910 }, { "epoch": 1.1835029887702273, "grad_norm": 0.549884557723999, "learning_rate": 3.312987376711971e-05, "loss": 0.0224, "num_input_tokens_seen": 19681440, "step": 5915 }, { "epoch": 1.1845034139509292, "grad_norm": 0.3596501052379608, "learning_rate": 3.310509746498649e-05, "loss": 0.0232, "num_input_tokens_seen": 19697888, "step": 5920 }, { "epoch": 1.185503839131631, "grad_norm": 0.14788438379764557, "learning_rate": 3.308031226394464e-05, "loss": 0.0294, "num_input_tokens_seen": 19714400, "step": 5925 }, { "epoch": 1.1865042643123327, "grad_norm": 0.21356885135173798, "learning_rate": 3.3055518191206824e-05, "loss": 0.0247, "num_input_tokens_seen": 19730848, "step": 5930 }, { "epoch": 1.1875046894930346, "grad_norm": 0.20204849541187286, "learning_rate": 3.303071527399543e-05, "loss": 0.0174, "num_input_tokens_seen": 19747424, "step": 5935 }, { "epoch": 1.1885051146737364, "grad_norm": 0.47173598408699036, "learning_rate": 3.300590353954256e-05, "loss": 0.0239, "num_input_tokens_seen": 19763584, "step": 5940 }, { "epoch": 1.189505539854438, "grad_norm": 0.3403458893299103, "learning_rate": 3.298108301509003e-05, "loss": 0.0171, "num_input_tokens_seen": 19780000, "step": 5945 }, { "epoch": 1.19050596503514, "grad_norm": 0.10040679574012756, "learning_rate": 3.295625372788925e-05, "loss": 0.0355, "num_input_tokens_seen": 19796000, "step": 5950 }, { "epoch": 1.1915063902158418, "grad_norm": 0.725883424282074, "learning_rate": 3.2931415705201294e-05, "loss": 0.0214, "num_input_tokens_seen": 19812512, "step": 5955 }, { "epoch": 1.1925068153965435, "grad_norm": 0.2431889921426773, "learning_rate": 3.290656897429682e-05, "loss": 0.0312, "num_input_tokens_seen": 19829888, "step": 5960 }, { "epoch": 1.1935072405772453, "grad_norm": 0.3370024561882019, "learning_rate": 3.2881713562456036e-05, "loss": 0.0287, "num_input_tokens_seen": 19847712, "step": 5965 }, { "epoch": 1.1945076657579472, "grad_norm": 0.269488662481308, "learning_rate": 3.2856849496968685e-05, "loss": 0.0374, "num_input_tokens_seen": 19863776, "step": 5970 }, { "epoch": 1.1955080909386488, "grad_norm": 0.19755752384662628, "learning_rate": 3.283197680513401e-05, "loss": 0.0123, "num_input_tokens_seen": 19880000, "step": 5975 }, { "epoch": 1.1965085161193507, "grad_norm": 0.37241336703300476, "learning_rate": 3.280709551426074e-05, "loss": 0.018, "num_input_tokens_seen": 19896896, "step": 5980 }, { "epoch": 1.1975089413000526, "grad_norm": 0.15598493814468384, "learning_rate": 3.2782205651667013e-05, "loss": 0.0124, "num_input_tokens_seen": 19912544, "step": 5985 }, { "epoch": 1.1985093664807542, "grad_norm": 0.23613229393959045, "learning_rate": 3.275730724468041e-05, "loss": 0.021, "num_input_tokens_seen": 19928608, "step": 5990 }, { "epoch": 1.199509791661456, "grad_norm": 0.42579779028892517, "learning_rate": 3.27324003206379e-05, "loss": 0.0172, "num_input_tokens_seen": 19944832, "step": 5995 }, { "epoch": 1.200510216842158, "grad_norm": 0.6366907954216003, "learning_rate": 3.270748490688575e-05, "loss": 0.0259, "num_input_tokens_seen": 19961280, "step": 6000 }, { "epoch": 1.2015106420228596, "grad_norm": 0.19301056861877441, "learning_rate": 3.268256103077961e-05, "loss": 0.0195, "num_input_tokens_seen": 19977056, "step": 6005 }, { "epoch": 1.2025110672035615, "grad_norm": 0.7739194631576538, "learning_rate": 3.2657628719684377e-05, "loss": 0.0369, "num_input_tokens_seen": 19993696, "step": 6010 }, { "epoch": 1.2035114923842634, "grad_norm": 0.4243657886981964, "learning_rate": 3.2632688000974234e-05, "loss": 0.0269, "num_input_tokens_seen": 20010176, "step": 6015 }, { "epoch": 1.204511917564965, "grad_norm": 0.7800456881523132, "learning_rate": 3.2607738902032586e-05, "loss": 0.0235, "num_input_tokens_seen": 20026752, "step": 6020 }, { "epoch": 1.2055123427456669, "grad_norm": 0.4734976887702942, "learning_rate": 3.258278145025204e-05, "loss": 0.019, "num_input_tokens_seen": 20042784, "step": 6025 }, { "epoch": 1.2065127679263687, "grad_norm": 0.06025788187980652, "learning_rate": 3.255781567303438e-05, "loss": 0.0142, "num_input_tokens_seen": 20059488, "step": 6030 }, { "epoch": 1.2075131931070704, "grad_norm": 0.283933162689209, "learning_rate": 3.253284159779052e-05, "loss": 0.0136, "num_input_tokens_seen": 20076320, "step": 6035 }, { "epoch": 1.2085136182877723, "grad_norm": 0.3365752696990967, "learning_rate": 3.250785925194048e-05, "loss": 0.0224, "num_input_tokens_seen": 20093984, "step": 6040 }, { "epoch": 1.2095140434684741, "grad_norm": 0.35834792256355286, "learning_rate": 3.248286866291338e-05, "loss": 0.0196, "num_input_tokens_seen": 20109984, "step": 6045 }, { "epoch": 1.2105144686491758, "grad_norm": 0.26457342505455017, "learning_rate": 3.24578698581474e-05, "loss": 0.0292, "num_input_tokens_seen": 20125760, "step": 6050 }, { "epoch": 1.2115148938298776, "grad_norm": 0.3771316409111023, "learning_rate": 3.243286286508968e-05, "loss": 0.0289, "num_input_tokens_seen": 20141504, "step": 6055 }, { "epoch": 1.2125153190105795, "grad_norm": 0.19131837785243988, "learning_rate": 3.240784771119642e-05, "loss": 0.0206, "num_input_tokens_seen": 20158752, "step": 6060 }, { "epoch": 1.2135157441912814, "grad_norm": 0.7011873722076416, "learning_rate": 3.238282442393274e-05, "loss": 0.0252, "num_input_tokens_seen": 20175776, "step": 6065 }, { "epoch": 1.214516169371983, "grad_norm": 0.06214110925793648, "learning_rate": 3.235779303077272e-05, "loss": 0.0193, "num_input_tokens_seen": 20192704, "step": 6070 }, { "epoch": 1.215516594552685, "grad_norm": 0.32150939106941223, "learning_rate": 3.233275355919931e-05, "loss": 0.0162, "num_input_tokens_seen": 20211104, "step": 6075 }, { "epoch": 1.2165170197333868, "grad_norm": 0.19849102199077606, "learning_rate": 3.230770603670433e-05, "loss": 0.0177, "num_input_tokens_seen": 20228000, "step": 6080 }, { "epoch": 1.2175174449140884, "grad_norm": 0.8177050352096558, "learning_rate": 3.2282650490788466e-05, "loss": 0.022, "num_input_tokens_seen": 20244448, "step": 6085 }, { "epoch": 1.2185178700947903, "grad_norm": 1.0571411848068237, "learning_rate": 3.225758694896118e-05, "loss": 0.0463, "num_input_tokens_seen": 20261344, "step": 6090 }, { "epoch": 1.2195182952754922, "grad_norm": 0.6836508512496948, "learning_rate": 3.2232515438740754e-05, "loss": 0.0406, "num_input_tokens_seen": 20278368, "step": 6095 }, { "epoch": 1.2205187204561938, "grad_norm": 0.42014962434768677, "learning_rate": 3.220743598765418e-05, "loss": 0.0323, "num_input_tokens_seen": 20295456, "step": 6100 }, { "epoch": 1.2215191456368957, "grad_norm": 0.3057481646537781, "learning_rate": 3.218234862323718e-05, "loss": 0.0307, "num_input_tokens_seen": 20311520, "step": 6105 }, { "epoch": 1.2225195708175975, "grad_norm": 0.5091171264648438, "learning_rate": 3.2157253373034194e-05, "loss": 0.023, "num_input_tokens_seen": 20328544, "step": 6110 }, { "epoch": 1.2235199959982992, "grad_norm": 0.3312574326992035, "learning_rate": 3.213215026459826e-05, "loss": 0.0137, "num_input_tokens_seen": 20344640, "step": 6115 }, { "epoch": 1.224520421179001, "grad_norm": 0.09498897194862366, "learning_rate": 3.2107039325491095e-05, "loss": 0.0186, "num_input_tokens_seen": 20359840, "step": 6120 }, { "epoch": 1.225520846359703, "grad_norm": 0.7953746914863586, "learning_rate": 3.2081920583283005e-05, "loss": 0.0198, "num_input_tokens_seen": 20377056, "step": 6125 }, { "epoch": 1.2265212715404046, "grad_norm": 0.34085607528686523, "learning_rate": 3.205679406555285e-05, "loss": 0.0184, "num_input_tokens_seen": 20393152, "step": 6130 }, { "epoch": 1.2275216967211064, "grad_norm": 0.37727925181388855, "learning_rate": 3.2031659799888016e-05, "loss": 0.025, "num_input_tokens_seen": 20409888, "step": 6135 }, { "epoch": 1.2285221219018083, "grad_norm": 0.23031802475452423, "learning_rate": 3.2006517813884434e-05, "loss": 0.0213, "num_input_tokens_seen": 20426528, "step": 6140 }, { "epoch": 1.2295225470825102, "grad_norm": 0.8728134632110596, "learning_rate": 3.198136813514648e-05, "loss": 0.0241, "num_input_tokens_seen": 20443488, "step": 6145 }, { "epoch": 1.2305229722632118, "grad_norm": 0.5621263980865479, "learning_rate": 3.1956210791287e-05, "loss": 0.0188, "num_input_tokens_seen": 20458944, "step": 6150 }, { "epoch": 1.2315233974439137, "grad_norm": 0.6416258811950684, "learning_rate": 3.193104580992722e-05, "loss": 0.019, "num_input_tokens_seen": 20474784, "step": 6155 }, { "epoch": 1.2325238226246156, "grad_norm": 0.5702195763587952, "learning_rate": 3.190587321869678e-05, "loss": 0.0242, "num_input_tokens_seen": 20491264, "step": 6160 }, { "epoch": 1.2335242478053172, "grad_norm": 0.2383430451154709, "learning_rate": 3.188069304523367e-05, "loss": 0.022, "num_input_tokens_seen": 20507424, "step": 6165 }, { "epoch": 1.234524672986019, "grad_norm": 0.33039844036102295, "learning_rate": 3.18555053171842e-05, "loss": 0.0249, "num_input_tokens_seen": 20523360, "step": 6170 }, { "epoch": 1.235525098166721, "grad_norm": 0.735704779624939, "learning_rate": 3.1830310062202996e-05, "loss": 0.0278, "num_input_tokens_seen": 20539328, "step": 6175 }, { "epoch": 1.2365255233474226, "grad_norm": 0.1837497353553772, "learning_rate": 3.1805107307952906e-05, "loss": 0.0138, "num_input_tokens_seen": 20555872, "step": 6180 }, { "epoch": 1.2375259485281245, "grad_norm": 0.46224069595336914, "learning_rate": 3.1779897082105054e-05, "loss": 0.0244, "num_input_tokens_seen": 20572416, "step": 6185 }, { "epoch": 1.2385263737088263, "grad_norm": 0.15932872891426086, "learning_rate": 3.175467941233873e-05, "loss": 0.0263, "num_input_tokens_seen": 20589216, "step": 6190 }, { "epoch": 1.239526798889528, "grad_norm": 0.43344810605049133, "learning_rate": 3.172945432634142e-05, "loss": 0.0236, "num_input_tokens_seen": 20605056, "step": 6195 }, { "epoch": 1.2405272240702299, "grad_norm": 0.2447565197944641, "learning_rate": 3.170422185180877e-05, "loss": 0.0274, "num_input_tokens_seen": 20622336, "step": 6200 }, { "epoch": 1.2415276492509317, "grad_norm": 0.7022611498832703, "learning_rate": 3.167898201644449e-05, "loss": 0.0225, "num_input_tokens_seen": 20638528, "step": 6205 }, { "epoch": 1.2425280744316334, "grad_norm": 0.3187255859375, "learning_rate": 3.165373484796041e-05, "loss": 0.014, "num_input_tokens_seen": 20654656, "step": 6210 }, { "epoch": 1.2435284996123352, "grad_norm": 0.4601322412490845, "learning_rate": 3.162848037407641e-05, "loss": 0.0326, "num_input_tokens_seen": 20672128, "step": 6215 }, { "epoch": 1.2445289247930371, "grad_norm": 0.6867743730545044, "learning_rate": 3.160321862252036e-05, "loss": 0.0252, "num_input_tokens_seen": 20688096, "step": 6220 }, { "epoch": 1.2455293499737388, "grad_norm": 0.3737960755825043, "learning_rate": 3.157794962102815e-05, "loss": 0.0139, "num_input_tokens_seen": 20703360, "step": 6225 }, { "epoch": 1.2465297751544406, "grad_norm": 0.09392146021127701, "learning_rate": 3.155267339734362e-05, "loss": 0.0217, "num_input_tokens_seen": 20719936, "step": 6230 }, { "epoch": 1.2475302003351425, "grad_norm": 0.47164633870124817, "learning_rate": 3.1527389979218546e-05, "loss": 0.0156, "num_input_tokens_seen": 20737440, "step": 6235 }, { "epoch": 1.2485306255158442, "grad_norm": 0.5443530082702637, "learning_rate": 3.150209939441259e-05, "loss": 0.0445, "num_input_tokens_seen": 20754176, "step": 6240 }, { "epoch": 1.249531050696546, "grad_norm": 0.3592490553855896, "learning_rate": 3.14768016706933e-05, "loss": 0.0234, "num_input_tokens_seen": 20771232, "step": 6245 }, { "epoch": 1.2505314758772479, "grad_norm": 0.3470192551612854, "learning_rate": 3.1451496835836044e-05, "loss": 0.0229, "num_input_tokens_seen": 20789184, "step": 6250 }, { "epoch": 1.2515319010579495, "grad_norm": 0.5166939496994019, "learning_rate": 3.1426184917624e-05, "loss": 0.035, "num_input_tokens_seen": 20805152, "step": 6255 }, { "epoch": 1.2525323262386514, "grad_norm": 0.2819918692111969, "learning_rate": 3.1400865943848146e-05, "loss": 0.0136, "num_input_tokens_seen": 20822432, "step": 6260 }, { "epoch": 1.2535327514193533, "grad_norm": 0.39920511841773987, "learning_rate": 3.137553994230718e-05, "loss": 0.0189, "num_input_tokens_seen": 20838656, "step": 6265 }, { "epoch": 1.254533176600055, "grad_norm": 0.2822507321834564, "learning_rate": 3.135020694080752e-05, "loss": 0.0292, "num_input_tokens_seen": 20855552, "step": 6270 }, { "epoch": 1.2555336017807568, "grad_norm": 1.1919745206832886, "learning_rate": 3.132486696716329e-05, "loss": 0.0348, "num_input_tokens_seen": 20871200, "step": 6275 }, { "epoch": 1.2565340269614587, "grad_norm": 0.3402027487754822, "learning_rate": 3.129952004919624e-05, "loss": 0.0268, "num_input_tokens_seen": 20889312, "step": 6280 }, { "epoch": 1.2575344521421603, "grad_norm": 0.7842808961868286, "learning_rate": 3.127416621473576e-05, "loss": 0.0287, "num_input_tokens_seen": 20905376, "step": 6285 }, { "epoch": 1.2585348773228622, "grad_norm": 0.10425746440887451, "learning_rate": 3.124880549161885e-05, "loss": 0.0185, "num_input_tokens_seen": 20923360, "step": 6290 }, { "epoch": 1.259535302503564, "grad_norm": 0.6856287717819214, "learning_rate": 3.122343790769003e-05, "loss": 0.0298, "num_input_tokens_seen": 20939840, "step": 6295 }, { "epoch": 1.2605357276842657, "grad_norm": 0.2629176080226898, "learning_rate": 3.119806349080139e-05, "loss": 0.017, "num_input_tokens_seen": 20956928, "step": 6300 }, { "epoch": 1.2615361528649676, "grad_norm": 0.6491134166717529, "learning_rate": 3.117268226881252e-05, "loss": 0.0198, "num_input_tokens_seen": 20973024, "step": 6305 }, { "epoch": 1.2625365780456694, "grad_norm": 0.3988141119480133, "learning_rate": 3.114729426959046e-05, "loss": 0.0158, "num_input_tokens_seen": 20988544, "step": 6310 }, { "epoch": 1.263537003226371, "grad_norm": 0.30559006333351135, "learning_rate": 3.11218995210097e-05, "loss": 0.0147, "num_input_tokens_seen": 21005920, "step": 6315 }, { "epoch": 1.264537428407073, "grad_norm": 0.23855215311050415, "learning_rate": 3.109649805095216e-05, "loss": 0.0116, "num_input_tokens_seen": 21022528, "step": 6320 }, { "epoch": 1.2655378535877748, "grad_norm": 0.3411101698875427, "learning_rate": 3.107108988730712e-05, "loss": 0.0375, "num_input_tokens_seen": 21038560, "step": 6325 }, { "epoch": 1.2665382787684765, "grad_norm": 0.09174685180187225, "learning_rate": 3.1045675057971204e-05, "loss": 0.0342, "num_input_tokens_seen": 21055200, "step": 6330 }, { "epoch": 1.2675387039491783, "grad_norm": 0.7968325614929199, "learning_rate": 3.102025359084836e-05, "loss": 0.0329, "num_input_tokens_seen": 21071648, "step": 6335 }, { "epoch": 1.2685391291298802, "grad_norm": 0.650685727596283, "learning_rate": 3.099482551384984e-05, "loss": 0.0205, "num_input_tokens_seen": 21088864, "step": 6340 }, { "epoch": 1.269539554310582, "grad_norm": 0.7141836285591125, "learning_rate": 3.096939085489413e-05, "loss": 0.0244, "num_input_tokens_seen": 21105472, "step": 6345 }, { "epoch": 1.2705399794912837, "grad_norm": 0.36796361207962036, "learning_rate": 3.094394964190696e-05, "loss": 0.0275, "num_input_tokens_seen": 21121088, "step": 6350 }, { "epoch": 1.2715404046719856, "grad_norm": 1.0574023723602295, "learning_rate": 3.091850190282124e-05, "loss": 0.0256, "num_input_tokens_seen": 21139008, "step": 6355 }, { "epoch": 1.2725408298526875, "grad_norm": 0.5442276000976562, "learning_rate": 3.089304766557707e-05, "loss": 0.0146, "num_input_tokens_seen": 21156160, "step": 6360 }, { "epoch": 1.273541255033389, "grad_norm": 0.46550431847572327, "learning_rate": 3.0867586958121654e-05, "loss": 0.0216, "num_input_tokens_seen": 21172064, "step": 6365 }, { "epoch": 1.274541680214091, "grad_norm": 0.11053989082574844, "learning_rate": 3.0842119808409315e-05, "loss": 0.0173, "num_input_tokens_seen": 21188736, "step": 6370 }, { "epoch": 1.2755421053947928, "grad_norm": 0.2156449854373932, "learning_rate": 3.0816646244401455e-05, "loss": 0.017, "num_input_tokens_seen": 21204320, "step": 6375 }, { "epoch": 1.2765425305754947, "grad_norm": 0.15073303878307343, "learning_rate": 3.079116629406651e-05, "loss": 0.021, "num_input_tokens_seen": 21221120, "step": 6380 }, { "epoch": 1.2775429557561964, "grad_norm": 0.4173339307308197, "learning_rate": 3.0765679985379935e-05, "loss": 0.0205, "num_input_tokens_seen": 21237408, "step": 6385 }, { "epoch": 1.2785433809368982, "grad_norm": 1.0750230550765991, "learning_rate": 3.074018734632416e-05, "loss": 0.0493, "num_input_tokens_seen": 21253408, "step": 6390 }, { "epoch": 1.2795438061176, "grad_norm": 0.5304452776908875, "learning_rate": 3.071468840488855e-05, "loss": 0.0112, "num_input_tokens_seen": 21270432, "step": 6395 }, { "epoch": 1.2805442312983017, "grad_norm": 0.32167404890060425, "learning_rate": 3.068918318906943e-05, "loss": 0.0235, "num_input_tokens_seen": 21287040, "step": 6400 }, { "epoch": 1.2815446564790036, "grad_norm": 0.5210312008857727, "learning_rate": 3.066367172686998e-05, "loss": 0.014, "num_input_tokens_seen": 21302752, "step": 6405 }, { "epoch": 1.2825450816597055, "grad_norm": 0.31968334317207336, "learning_rate": 3.063815404630024e-05, "loss": 0.0105, "num_input_tokens_seen": 21318368, "step": 6410 }, { "epoch": 1.2835455068404071, "grad_norm": 0.16647179424762726, "learning_rate": 3.06126301753771e-05, "loss": 0.017, "num_input_tokens_seen": 21334464, "step": 6415 }, { "epoch": 1.284545932021109, "grad_norm": 0.8157343864440918, "learning_rate": 3.0587100142124225e-05, "loss": 0.025, "num_input_tokens_seen": 21351840, "step": 6420 }, { "epoch": 1.2855463572018109, "grad_norm": 0.6097559928894043, "learning_rate": 3.056156397457205e-05, "loss": 0.0332, "num_input_tokens_seen": 21368416, "step": 6425 }, { "epoch": 1.2865467823825125, "grad_norm": 0.5425021648406982, "learning_rate": 3.053602170075775e-05, "loss": 0.0437, "num_input_tokens_seen": 21385344, "step": 6430 }, { "epoch": 1.2875472075632144, "grad_norm": 0.6646090745925903, "learning_rate": 3.051047334872521e-05, "loss": 0.02, "num_input_tokens_seen": 21400288, "step": 6435 }, { "epoch": 1.2885476327439163, "grad_norm": 0.26214632391929626, "learning_rate": 3.0484918946524983e-05, "loss": 0.0205, "num_input_tokens_seen": 21416704, "step": 6440 }, { "epoch": 1.289548057924618, "grad_norm": 0.2173846811056137, "learning_rate": 3.0459358522214244e-05, "loss": 0.0151, "num_input_tokens_seen": 21432928, "step": 6445 }, { "epoch": 1.2905484831053198, "grad_norm": 0.31593844294548035, "learning_rate": 3.043379210385681e-05, "loss": 0.0234, "num_input_tokens_seen": 21448928, "step": 6450 }, { "epoch": 1.2915489082860216, "grad_norm": 0.6138732433319092, "learning_rate": 3.0408219719523077e-05, "loss": 0.0166, "num_input_tokens_seen": 21466240, "step": 6455 }, { "epoch": 1.2925493334667233, "grad_norm": 0.15956415235996246, "learning_rate": 3.0382641397289967e-05, "loss": 0.0154, "num_input_tokens_seen": 21482080, "step": 6460 }, { "epoch": 1.2935497586474252, "grad_norm": 0.5225535035133362, "learning_rate": 3.0357057165240943e-05, "loss": 0.0199, "num_input_tokens_seen": 21497920, "step": 6465 }, { "epoch": 1.294550183828127, "grad_norm": 0.4933944344520569, "learning_rate": 3.0331467051465955e-05, "loss": 0.03, "num_input_tokens_seen": 21514848, "step": 6470 }, { "epoch": 1.2955506090088287, "grad_norm": 0.8946163058280945, "learning_rate": 3.0305871084061398e-05, "loss": 0.0411, "num_input_tokens_seen": 21531136, "step": 6475 }, { "epoch": 1.2965510341895305, "grad_norm": 0.2678624391555786, "learning_rate": 3.0280269291130104e-05, "loss": 0.0279, "num_input_tokens_seen": 21546208, "step": 6480 }, { "epoch": 1.2975514593702324, "grad_norm": 0.4984210133552551, "learning_rate": 3.02546617007813e-05, "loss": 0.0194, "num_input_tokens_seen": 21561632, "step": 6485 }, { "epoch": 1.298551884550934, "grad_norm": 0.8703275918960571, "learning_rate": 3.022904834113058e-05, "loss": 0.0208, "num_input_tokens_seen": 21577760, "step": 6490 }, { "epoch": 1.299552309731636, "grad_norm": 0.43906381726264954, "learning_rate": 3.0203429240299865e-05, "loss": 0.022, "num_input_tokens_seen": 21594176, "step": 6495 }, { "epoch": 1.3005527349123378, "grad_norm": 0.18665099143981934, "learning_rate": 3.0177804426417384e-05, "loss": 0.0115, "num_input_tokens_seen": 21611552, "step": 6500 }, { "epoch": 1.3015531600930395, "grad_norm": 0.3526068925857544, "learning_rate": 3.0152173927617634e-05, "loss": 0.0287, "num_input_tokens_seen": 21628864, "step": 6505 }, { "epoch": 1.3025535852737413, "grad_norm": 0.19899950921535492, "learning_rate": 3.0126537772041375e-05, "loss": 0.0621, "num_input_tokens_seen": 21645408, "step": 6510 }, { "epoch": 1.3035540104544432, "grad_norm": 0.21650636196136475, "learning_rate": 3.0100895987835547e-05, "loss": 0.0161, "num_input_tokens_seen": 21661952, "step": 6515 }, { "epoch": 1.3045544356351448, "grad_norm": 0.11611881852149963, "learning_rate": 3.0075248603153284e-05, "loss": 0.0178, "num_input_tokens_seen": 21677760, "step": 6520 }, { "epoch": 1.3055548608158467, "grad_norm": 0.25818386673927307, "learning_rate": 3.0049595646153877e-05, "loss": 0.0135, "num_input_tokens_seen": 21694560, "step": 6525 }, { "epoch": 1.3065552859965486, "grad_norm": 0.281645268201828, "learning_rate": 3.0023937145002717e-05, "loss": 0.016, "num_input_tokens_seen": 21711392, "step": 6530 }, { "epoch": 1.3075557111772502, "grad_norm": 0.5636654496192932, "learning_rate": 2.9998273127871302e-05, "loss": 0.0218, "num_input_tokens_seen": 21728800, "step": 6535 }, { "epoch": 1.308556136357952, "grad_norm": 0.3114864230155945, "learning_rate": 2.9972603622937167e-05, "loss": 0.0094, "num_input_tokens_seen": 21745664, "step": 6540 }, { "epoch": 1.309556561538654, "grad_norm": 0.8865411877632141, "learning_rate": 2.9946928658383893e-05, "loss": 0.0178, "num_input_tokens_seen": 21761536, "step": 6545 }, { "epoch": 1.3105569867193556, "grad_norm": 0.29117724299430847, "learning_rate": 2.992124826240104e-05, "loss": 0.014, "num_input_tokens_seen": 21777888, "step": 6550 }, { "epoch": 1.3115574119000575, "grad_norm": 0.3249759376049042, "learning_rate": 2.989556246318412e-05, "loss": 0.0192, "num_input_tokens_seen": 21793952, "step": 6555 }, { "epoch": 1.3125578370807593, "grad_norm": 1.2025161981582642, "learning_rate": 2.9869871288934615e-05, "loss": 0.0213, "num_input_tokens_seen": 21810208, "step": 6560 }, { "epoch": 1.313558262261461, "grad_norm": 0.5750439167022705, "learning_rate": 2.9844174767859877e-05, "loss": 0.0228, "num_input_tokens_seen": 21826368, "step": 6565 }, { "epoch": 1.3145586874421629, "grad_norm": 0.24626857042312622, "learning_rate": 2.9818472928173136e-05, "loss": 0.0097, "num_input_tokens_seen": 21844192, "step": 6570 }, { "epoch": 1.3155591126228647, "grad_norm": 1.0716522932052612, "learning_rate": 2.9792765798093465e-05, "loss": 0.0338, "num_input_tokens_seen": 21861312, "step": 6575 }, { "epoch": 1.3165595378035664, "grad_norm": 0.2427041381597519, "learning_rate": 2.976705340584574e-05, "loss": 0.0277, "num_input_tokens_seen": 21878080, "step": 6580 }, { "epoch": 1.3175599629842683, "grad_norm": 0.1594732105731964, "learning_rate": 2.974133577966063e-05, "loss": 0.013, "num_input_tokens_seen": 21893856, "step": 6585 }, { "epoch": 1.3185603881649701, "grad_norm": 0.49334800243377686, "learning_rate": 2.9715612947774523e-05, "loss": 0.0116, "num_input_tokens_seen": 21910496, "step": 6590 }, { "epoch": 1.319560813345672, "grad_norm": 0.2606200873851776, "learning_rate": 2.9689884938429552e-05, "loss": 0.0096, "num_input_tokens_seen": 21927488, "step": 6595 }, { "epoch": 1.3205612385263736, "grad_norm": 0.9450080394744873, "learning_rate": 2.9664151779873516e-05, "loss": 0.0471, "num_input_tokens_seen": 21943360, "step": 6600 }, { "epoch": 1.3215616637070755, "grad_norm": 0.2751266360282898, "learning_rate": 2.9638413500359868e-05, "loss": 0.0349, "num_input_tokens_seen": 21960320, "step": 6605 }, { "epoch": 1.3225620888877774, "grad_norm": 0.6991470456123352, "learning_rate": 2.961267012814769e-05, "loss": 0.0194, "num_input_tokens_seen": 21977920, "step": 6610 }, { "epoch": 1.323562514068479, "grad_norm": 0.5267415642738342, "learning_rate": 2.9586921691501662e-05, "loss": 0.0303, "num_input_tokens_seen": 21996000, "step": 6615 }, { "epoch": 1.324562939249181, "grad_norm": 0.19206087291240692, "learning_rate": 2.9561168218692008e-05, "loss": 0.0129, "num_input_tokens_seen": 22012064, "step": 6620 }, { "epoch": 1.3255633644298828, "grad_norm": 0.3343439996242523, "learning_rate": 2.953540973799449e-05, "loss": 0.0161, "num_input_tokens_seen": 22028832, "step": 6625 }, { "epoch": 1.3265637896105846, "grad_norm": 0.031643129885196686, "learning_rate": 2.9509646277690362e-05, "loss": 0.0172, "num_input_tokens_seen": 22047168, "step": 6630 }, { "epoch": 1.3275642147912863, "grad_norm": 0.3571871221065521, "learning_rate": 2.9483877866066368e-05, "loss": 0.0201, "num_input_tokens_seen": 22063840, "step": 6635 }, { "epoch": 1.3285646399719881, "grad_norm": 0.09580302238464355, "learning_rate": 2.9458104531414654e-05, "loss": 0.0108, "num_input_tokens_seen": 22079968, "step": 6640 }, { "epoch": 1.32956506515269, "grad_norm": 0.920793354511261, "learning_rate": 2.9432326302032793e-05, "loss": 0.0355, "num_input_tokens_seen": 22096320, "step": 6645 }, { "epoch": 1.3305654903333917, "grad_norm": 0.5107559561729431, "learning_rate": 2.9406543206223735e-05, "loss": 0.0151, "num_input_tokens_seen": 22113600, "step": 6650 }, { "epoch": 1.3315659155140935, "grad_norm": 0.9166436195373535, "learning_rate": 2.9380755272295758e-05, "loss": 0.0202, "num_input_tokens_seen": 22129664, "step": 6655 }, { "epoch": 1.3325663406947954, "grad_norm": 0.2292318493127823, "learning_rate": 2.9354962528562463e-05, "loss": 0.0141, "num_input_tokens_seen": 22145728, "step": 6660 }, { "epoch": 1.333566765875497, "grad_norm": 0.16360390186309814, "learning_rate": 2.9329165003342722e-05, "loss": 0.0226, "num_input_tokens_seen": 22162880, "step": 6665 }, { "epoch": 1.334567191056199, "grad_norm": 0.2184598594903946, "learning_rate": 2.9303362724960677e-05, "loss": 0.0116, "num_input_tokens_seen": 22180672, "step": 6670 }, { "epoch": 1.3355676162369008, "grad_norm": 0.5025904774665833, "learning_rate": 2.9277555721745664e-05, "loss": 0.0172, "num_input_tokens_seen": 22197568, "step": 6675 }, { "epoch": 1.3365680414176024, "grad_norm": 0.45514610409736633, "learning_rate": 2.925174402203221e-05, "loss": 0.0442, "num_input_tokens_seen": 22214368, "step": 6680 }, { "epoch": 1.3375684665983043, "grad_norm": 0.19939345121383667, "learning_rate": 2.9225927654160023e-05, "loss": 0.0143, "num_input_tokens_seen": 22230624, "step": 6685 }, { "epoch": 1.3385688917790062, "grad_norm": 0.5434321165084839, "learning_rate": 2.9200106646473917e-05, "loss": 0.0244, "num_input_tokens_seen": 22246208, "step": 6690 }, { "epoch": 1.3395693169597078, "grad_norm": 0.7273253798484802, "learning_rate": 2.9174281027323796e-05, "loss": 0.0225, "num_input_tokens_seen": 22263072, "step": 6695 }, { "epoch": 1.3405697421404097, "grad_norm": 0.33907535672187805, "learning_rate": 2.9148450825064633e-05, "loss": 0.0142, "num_input_tokens_seen": 22279968, "step": 6700 }, { "epoch": 1.3415701673211116, "grad_norm": 0.19482874870300293, "learning_rate": 2.9122616068056436e-05, "loss": 0.0273, "num_input_tokens_seen": 22295904, "step": 6705 }, { "epoch": 1.3425705925018132, "grad_norm": 0.37009555101394653, "learning_rate": 2.9096776784664204e-05, "loss": 0.0221, "num_input_tokens_seen": 22312704, "step": 6710 }, { "epoch": 1.343571017682515, "grad_norm": 0.6175490021705627, "learning_rate": 2.9070933003257933e-05, "loss": 0.0338, "num_input_tokens_seen": 22329376, "step": 6715 }, { "epoch": 1.344571442863217, "grad_norm": 0.13512542843818665, "learning_rate": 2.9045084752212515e-05, "loss": 0.0271, "num_input_tokens_seen": 22344672, "step": 6720 }, { "epoch": 1.3455718680439186, "grad_norm": 0.40606412291526794, "learning_rate": 2.901923205990779e-05, "loss": 0.0142, "num_input_tokens_seen": 22361120, "step": 6725 }, { "epoch": 1.3465722932246205, "grad_norm": 0.6456787586212158, "learning_rate": 2.8993374954728448e-05, "loss": 0.0294, "num_input_tokens_seen": 22378656, "step": 6730 }, { "epoch": 1.3475727184053223, "grad_norm": 0.6901668310165405, "learning_rate": 2.8967513465064028e-05, "loss": 0.0253, "num_input_tokens_seen": 22394560, "step": 6735 }, { "epoch": 1.348573143586024, "grad_norm": 0.5287696719169617, "learning_rate": 2.894164761930889e-05, "loss": 0.0214, "num_input_tokens_seen": 22410880, "step": 6740 }, { "epoch": 1.3495735687667259, "grad_norm": 0.07469778507947922, "learning_rate": 2.8915777445862184e-05, "loss": 0.0167, "num_input_tokens_seen": 22427648, "step": 6745 }, { "epoch": 1.3505739939474277, "grad_norm": 0.35534486174583435, "learning_rate": 2.8889902973127796e-05, "loss": 0.0224, "num_input_tokens_seen": 22442976, "step": 6750 }, { "epoch": 1.3515744191281294, "grad_norm": 0.37630361318588257, "learning_rate": 2.886402422951433e-05, "loss": 0.0297, "num_input_tokens_seen": 22458784, "step": 6755 }, { "epoch": 1.3525748443088312, "grad_norm": 0.23175548017024994, "learning_rate": 2.883814124343509e-05, "loss": 0.0152, "num_input_tokens_seen": 22475872, "step": 6760 }, { "epoch": 1.353575269489533, "grad_norm": 0.23230740427970886, "learning_rate": 2.8812254043308052e-05, "loss": 0.0203, "num_input_tokens_seen": 22493344, "step": 6765 }, { "epoch": 1.3545756946702348, "grad_norm": 0.1427178680896759, "learning_rate": 2.8786362657555782e-05, "loss": 0.008, "num_input_tokens_seen": 22509344, "step": 6770 }, { "epoch": 1.3555761198509366, "grad_norm": 0.9476840496063232, "learning_rate": 2.8760467114605462e-05, "loss": 0.0318, "num_input_tokens_seen": 22525568, "step": 6775 }, { "epoch": 1.3565765450316385, "grad_norm": 0.44155165553092957, "learning_rate": 2.873456744288885e-05, "loss": 0.0274, "num_input_tokens_seen": 22542560, "step": 6780 }, { "epoch": 1.3575769702123401, "grad_norm": 0.45242565870285034, "learning_rate": 2.870866367084221e-05, "loss": 0.0168, "num_input_tokens_seen": 22559424, "step": 6785 }, { "epoch": 1.358577395393042, "grad_norm": 1.2894710302352905, "learning_rate": 2.868275582690634e-05, "loss": 0.0234, "num_input_tokens_seen": 22576064, "step": 6790 }, { "epoch": 1.3595778205737439, "grad_norm": 0.4912334084510803, "learning_rate": 2.8656843939526473e-05, "loss": 0.0252, "num_input_tokens_seen": 22591904, "step": 6795 }, { "epoch": 1.3605782457544455, "grad_norm": 0.2881946563720703, "learning_rate": 2.8630928037152322e-05, "loss": 0.0248, "num_input_tokens_seen": 22609024, "step": 6800 }, { "epoch": 1.3615786709351474, "grad_norm": 0.732107400894165, "learning_rate": 2.8605008148237965e-05, "loss": 0.0259, "num_input_tokens_seen": 22625088, "step": 6805 }, { "epoch": 1.3625790961158493, "grad_norm": 0.23752035200595856, "learning_rate": 2.8579084301241886e-05, "loss": 0.0287, "num_input_tokens_seen": 22642144, "step": 6810 }, { "epoch": 1.363579521296551, "grad_norm": 0.17034226655960083, "learning_rate": 2.855315652462691e-05, "loss": 0.0191, "num_input_tokens_seen": 22658592, "step": 6815 }, { "epoch": 1.3645799464772528, "grad_norm": 0.1437387317419052, "learning_rate": 2.852722484686018e-05, "loss": 0.0087, "num_input_tokens_seen": 22674432, "step": 6820 }, { "epoch": 1.3655803716579547, "grad_norm": 0.5661661028862, "learning_rate": 2.8501289296413103e-05, "loss": 0.037, "num_input_tokens_seen": 22690624, "step": 6825 }, { "epoch": 1.3665807968386563, "grad_norm": 0.27419745922088623, "learning_rate": 2.8475349901761344e-05, "loss": 0.0236, "num_input_tokens_seen": 22707424, "step": 6830 }, { "epoch": 1.3675812220193582, "grad_norm": 0.33506685495376587, "learning_rate": 2.844940669138481e-05, "loss": 0.0173, "num_input_tokens_seen": 22724576, "step": 6835 }, { "epoch": 1.36858164720006, "grad_norm": 0.2337416410446167, "learning_rate": 2.8423459693767585e-05, "loss": 0.0183, "num_input_tokens_seen": 22740992, "step": 6840 }, { "epoch": 1.369582072380762, "grad_norm": 0.4036204218864441, "learning_rate": 2.8397508937397904e-05, "loss": 0.0309, "num_input_tokens_seen": 22759104, "step": 6845 }, { "epoch": 1.3705824975614636, "grad_norm": 0.2100093513727188, "learning_rate": 2.8371554450768124e-05, "loss": 0.0129, "num_input_tokens_seen": 22775008, "step": 6850 }, { "epoch": 1.3715829227421654, "grad_norm": 0.11855494976043701, "learning_rate": 2.8345596262374718e-05, "loss": 0.0191, "num_input_tokens_seen": 22791424, "step": 6855 }, { "epoch": 1.3725833479228673, "grad_norm": 0.7924718856811523, "learning_rate": 2.8319634400718215e-05, "loss": 0.0223, "num_input_tokens_seen": 22807680, "step": 6860 }, { "epoch": 1.373583773103569, "grad_norm": 0.4390965402126312, "learning_rate": 2.8293668894303165e-05, "loss": 0.0142, "num_input_tokens_seen": 22825568, "step": 6865 }, { "epoch": 1.3745841982842708, "grad_norm": 1.074836015701294, "learning_rate": 2.8267699771638136e-05, "loss": 0.0325, "num_input_tokens_seen": 22842080, "step": 6870 }, { "epoch": 1.3755846234649727, "grad_norm": 0.17005810141563416, "learning_rate": 2.8241727061235662e-05, "loss": 0.0153, "num_input_tokens_seen": 22858624, "step": 6875 }, { "epoch": 1.3765850486456745, "grad_norm": 0.42438748478889465, "learning_rate": 2.8215750791612207e-05, "loss": 0.0198, "num_input_tokens_seen": 22874944, "step": 6880 }, { "epoch": 1.3775854738263762, "grad_norm": 0.13305796682834625, "learning_rate": 2.818977099128815e-05, "loss": 0.0268, "num_input_tokens_seen": 22892352, "step": 6885 }, { "epoch": 1.378585899007078, "grad_norm": 0.2759329676628113, "learning_rate": 2.8163787688787752e-05, "loss": 0.0229, "num_input_tokens_seen": 22908224, "step": 6890 }, { "epoch": 1.37958632418778, "grad_norm": 0.8038346171379089, "learning_rate": 2.813780091263912e-05, "loss": 0.0368, "num_input_tokens_seen": 22925088, "step": 6895 }, { "epoch": 1.3805867493684816, "grad_norm": 0.13557299971580505, "learning_rate": 2.811181069137415e-05, "loss": 0.0201, "num_input_tokens_seen": 22941920, "step": 6900 }, { "epoch": 1.3815871745491834, "grad_norm": 0.20478865504264832, "learning_rate": 2.8085817053528546e-05, "loss": 0.0131, "num_input_tokens_seen": 22959616, "step": 6905 }, { "epoch": 1.3825875997298853, "grad_norm": 0.3666134178638458, "learning_rate": 2.8059820027641765e-05, "loss": 0.0123, "num_input_tokens_seen": 22975872, "step": 6910 }, { "epoch": 1.383588024910587, "grad_norm": 0.8973509669303894, "learning_rate": 2.8033819642256963e-05, "loss": 0.0425, "num_input_tokens_seen": 22992800, "step": 6915 }, { "epoch": 1.3845884500912888, "grad_norm": 0.5391751527786255, "learning_rate": 2.8007815925921006e-05, "loss": 0.0351, "num_input_tokens_seen": 23009856, "step": 6920 }, { "epoch": 1.3855888752719907, "grad_norm": 0.22452957928180695, "learning_rate": 2.7981808907184398e-05, "loss": 0.0241, "num_input_tokens_seen": 23026848, "step": 6925 }, { "epoch": 1.3865893004526924, "grad_norm": 0.6008860468864441, "learning_rate": 2.7955798614601288e-05, "loss": 0.0285, "num_input_tokens_seen": 23042944, "step": 6930 }, { "epoch": 1.3875897256333942, "grad_norm": 0.6881251335144043, "learning_rate": 2.792978507672941e-05, "loss": 0.0314, "num_input_tokens_seen": 23060128, "step": 6935 }, { "epoch": 1.388590150814096, "grad_norm": 0.07176212221384048, "learning_rate": 2.7903768322130043e-05, "loss": 0.0177, "num_input_tokens_seen": 23078208, "step": 6940 }, { "epoch": 1.3895905759947977, "grad_norm": 0.32679834961891174, "learning_rate": 2.787774837936804e-05, "loss": 0.0244, "num_input_tokens_seen": 23095136, "step": 6945 }, { "epoch": 1.3905910011754996, "grad_norm": 0.17960023880004883, "learning_rate": 2.785172527701172e-05, "loss": 0.0246, "num_input_tokens_seen": 23110912, "step": 6950 }, { "epoch": 1.3915914263562015, "grad_norm": 0.3369027376174927, "learning_rate": 2.782569904363288e-05, "loss": 0.0106, "num_input_tokens_seen": 23127520, "step": 6955 }, { "epoch": 1.3925918515369031, "grad_norm": 0.31298232078552246, "learning_rate": 2.7799669707806758e-05, "loss": 0.0159, "num_input_tokens_seen": 23145792, "step": 6960 }, { "epoch": 1.393592276717605, "grad_norm": 1.370901107788086, "learning_rate": 2.7773637298111988e-05, "loss": 0.0335, "num_input_tokens_seen": 23162368, "step": 6965 }, { "epoch": 1.3945927018983069, "grad_norm": 0.2301444262266159, "learning_rate": 2.7747601843130607e-05, "loss": 0.0338, "num_input_tokens_seen": 23178048, "step": 6970 }, { "epoch": 1.3955931270790085, "grad_norm": 0.2517456114292145, "learning_rate": 2.772156337144795e-05, "loss": 0.0097, "num_input_tokens_seen": 23196352, "step": 6975 }, { "epoch": 1.3965935522597104, "grad_norm": 0.6408939957618713, "learning_rate": 2.7695521911652715e-05, "loss": 0.0281, "num_input_tokens_seen": 23212096, "step": 6980 }, { "epoch": 1.3975939774404122, "grad_norm": 0.6506691575050354, "learning_rate": 2.7669477492336844e-05, "loss": 0.0195, "num_input_tokens_seen": 23229216, "step": 6985 }, { "epoch": 1.398594402621114, "grad_norm": 0.1905229240655899, "learning_rate": 2.7643430142095532e-05, "loss": 0.0199, "num_input_tokens_seen": 23245312, "step": 6990 }, { "epoch": 1.3995948278018158, "grad_norm": 0.7158108949661255, "learning_rate": 2.761737988952721e-05, "loss": 0.0247, "num_input_tokens_seen": 23260896, "step": 6995 }, { "epoch": 1.4005952529825176, "grad_norm": 0.3425590395927429, "learning_rate": 2.7591326763233493e-05, "loss": 0.0109, "num_input_tokens_seen": 23277216, "step": 7000 }, { "epoch": 1.4015956781632193, "grad_norm": 1.0016491413116455, "learning_rate": 2.756527079181913e-05, "loss": 0.0262, "num_input_tokens_seen": 23294368, "step": 7005 }, { "epoch": 1.4025961033439212, "grad_norm": 0.25987598299980164, "learning_rate": 2.7539212003892013e-05, "loss": 0.0144, "num_input_tokens_seen": 23310816, "step": 7010 }, { "epoch": 1.403596528524623, "grad_norm": 0.5691525936126709, "learning_rate": 2.7513150428063112e-05, "loss": 0.0235, "num_input_tokens_seen": 23327968, "step": 7015 }, { "epoch": 1.4045969537053247, "grad_norm": 0.5938063263893127, "learning_rate": 2.7487086092946485e-05, "loss": 0.0288, "num_input_tokens_seen": 23344896, "step": 7020 }, { "epoch": 1.4055973788860265, "grad_norm": 0.743880569934845, "learning_rate": 2.7461019027159185e-05, "loss": 0.021, "num_input_tokens_seen": 23362912, "step": 7025 }, { "epoch": 1.4065978040667284, "grad_norm": 0.8439948558807373, "learning_rate": 2.743494925932129e-05, "loss": 0.045, "num_input_tokens_seen": 23378880, "step": 7030 }, { "epoch": 1.40759822924743, "grad_norm": 1.1395623683929443, "learning_rate": 2.740887681805583e-05, "loss": 0.0346, "num_input_tokens_seen": 23395040, "step": 7035 }, { "epoch": 1.408598654428132, "grad_norm": 0.6801261305809021, "learning_rate": 2.738280173198877e-05, "loss": 0.0255, "num_input_tokens_seen": 23411424, "step": 7040 }, { "epoch": 1.4095990796088338, "grad_norm": 0.37221503257751465, "learning_rate": 2.7356724029748994e-05, "loss": 0.0158, "num_input_tokens_seen": 23429184, "step": 7045 }, { "epoch": 1.4105995047895354, "grad_norm": 0.4096348285675049, "learning_rate": 2.7330643739968237e-05, "loss": 0.0174, "num_input_tokens_seen": 23445760, "step": 7050 }, { "epoch": 1.4115999299702373, "grad_norm": 0.6790040135383606, "learning_rate": 2.7304560891281095e-05, "loss": 0.0309, "num_input_tokens_seen": 23461120, "step": 7055 }, { "epoch": 1.4126003551509392, "grad_norm": 0.37077081203460693, "learning_rate": 2.727847551232496e-05, "loss": 0.0248, "num_input_tokens_seen": 23477024, "step": 7060 }, { "epoch": 1.4136007803316408, "grad_norm": 0.364713579416275, "learning_rate": 2.725238763174e-05, "loss": 0.0092, "num_input_tokens_seen": 23493856, "step": 7065 }, { "epoch": 1.4146012055123427, "grad_norm": 0.5751842856407166, "learning_rate": 2.722629727816914e-05, "loss": 0.0299, "num_input_tokens_seen": 23510976, "step": 7070 }, { "epoch": 1.4156016306930446, "grad_norm": 0.21734090149402618, "learning_rate": 2.7200204480258023e-05, "loss": 0.0175, "num_input_tokens_seen": 23528864, "step": 7075 }, { "epoch": 1.4166020558737462, "grad_norm": 0.5161658525466919, "learning_rate": 2.7174109266654966e-05, "loss": 0.0222, "num_input_tokens_seen": 23544896, "step": 7080 }, { "epoch": 1.417602481054448, "grad_norm": 0.2955359220504761, "learning_rate": 2.7148011666010936e-05, "loss": 0.0177, "num_input_tokens_seen": 23561440, "step": 7085 }, { "epoch": 1.41860290623515, "grad_norm": 0.1447141021490097, "learning_rate": 2.712191170697953e-05, "loss": 0.0205, "num_input_tokens_seen": 23577664, "step": 7090 }, { "epoch": 1.4196033314158516, "grad_norm": 0.23074093461036682, "learning_rate": 2.709580941821692e-05, "loss": 0.0225, "num_input_tokens_seen": 23594336, "step": 7095 }, { "epoch": 1.4206037565965535, "grad_norm": 0.747652530670166, "learning_rate": 2.706970482838187e-05, "loss": 0.0253, "num_input_tokens_seen": 23612000, "step": 7100 }, { "epoch": 1.4216041817772553, "grad_norm": 0.6384264826774597, "learning_rate": 2.704359796613562e-05, "loss": 0.0167, "num_input_tokens_seen": 23628576, "step": 7105 }, { "epoch": 1.4226046069579572, "grad_norm": 1.05685555934906, "learning_rate": 2.7017488860141955e-05, "loss": 0.0222, "num_input_tokens_seen": 23646112, "step": 7110 }, { "epoch": 1.4236050321386589, "grad_norm": 0.3217991292476654, "learning_rate": 2.699137753906708e-05, "loss": 0.0158, "num_input_tokens_seen": 23662560, "step": 7115 }, { "epoch": 1.4246054573193607, "grad_norm": 1.1819077730178833, "learning_rate": 2.696526403157966e-05, "loss": 0.0334, "num_input_tokens_seen": 23679264, "step": 7120 }, { "epoch": 1.4256058825000626, "grad_norm": 0.6659606099128723, "learning_rate": 2.6939148366350758e-05, "loss": 0.0224, "num_input_tokens_seen": 23695648, "step": 7125 }, { "epoch": 1.4266063076807645, "grad_norm": 0.5607231259346008, "learning_rate": 2.6913030572053798e-05, "loss": 0.0257, "num_input_tokens_seen": 23711616, "step": 7130 }, { "epoch": 1.427606732861466, "grad_norm": 0.20791594684123993, "learning_rate": 2.6886910677364542e-05, "loss": 0.0254, "num_input_tokens_seen": 23728544, "step": 7135 }, { "epoch": 1.428607158042168, "grad_norm": 1.3680839538574219, "learning_rate": 2.6860788710961054e-05, "loss": 0.0464, "num_input_tokens_seen": 23745536, "step": 7140 }, { "epoch": 1.4296075832228698, "grad_norm": 0.24108925461769104, "learning_rate": 2.683466470152369e-05, "loss": 0.0127, "num_input_tokens_seen": 23761856, "step": 7145 }, { "epoch": 1.4306080084035715, "grad_norm": 0.32991474866867065, "learning_rate": 2.680853867773504e-05, "loss": 0.0298, "num_input_tokens_seen": 23777728, "step": 7150 }, { "epoch": 1.4316084335842734, "grad_norm": 0.19740597903728485, "learning_rate": 2.6782410668279895e-05, "loss": 0.0396, "num_input_tokens_seen": 23794560, "step": 7155 }, { "epoch": 1.4326088587649752, "grad_norm": 0.8721890449523926, "learning_rate": 2.675628070184523e-05, "loss": 0.0191, "num_input_tokens_seen": 23812032, "step": 7160 }, { "epoch": 1.4336092839456769, "grad_norm": 0.4150925874710083, "learning_rate": 2.6730148807120197e-05, "loss": 0.0166, "num_input_tokens_seen": 23828608, "step": 7165 }, { "epoch": 1.4346097091263788, "grad_norm": 0.7514519095420837, "learning_rate": 2.6704015012796013e-05, "loss": 0.03, "num_input_tokens_seen": 23844224, "step": 7170 }, { "epoch": 1.4356101343070806, "grad_norm": 0.13385280966758728, "learning_rate": 2.6677879347566038e-05, "loss": 0.0137, "num_input_tokens_seen": 23861088, "step": 7175 }, { "epoch": 1.4366105594877823, "grad_norm": 0.2709481716156006, "learning_rate": 2.6651741840125627e-05, "loss": 0.01, "num_input_tokens_seen": 23877952, "step": 7180 }, { "epoch": 1.4376109846684841, "grad_norm": 0.05890822038054466, "learning_rate": 2.6625602519172216e-05, "loss": 0.0174, "num_input_tokens_seen": 23893568, "step": 7185 }, { "epoch": 1.438611409849186, "grad_norm": 0.053118109703063965, "learning_rate": 2.6599461413405192e-05, "loss": 0.02, "num_input_tokens_seen": 23910592, "step": 7190 }, { "epoch": 1.4396118350298877, "grad_norm": 0.2782624065876007, "learning_rate": 2.6573318551525904e-05, "loss": 0.0301, "num_input_tokens_seen": 23926400, "step": 7195 }, { "epoch": 1.4406122602105895, "grad_norm": 0.9280529022216797, "learning_rate": 2.6547173962237653e-05, "loss": 0.0295, "num_input_tokens_seen": 23942080, "step": 7200 }, { "epoch": 1.4416126853912914, "grad_norm": 0.22394904494285583, "learning_rate": 2.6521027674245625e-05, "loss": 0.0128, "num_input_tokens_seen": 23959136, "step": 7205 }, { "epoch": 1.442613110571993, "grad_norm": 0.5346195697784424, "learning_rate": 2.649487971625686e-05, "loss": 0.024, "num_input_tokens_seen": 23976224, "step": 7210 }, { "epoch": 1.443613535752695, "grad_norm": 0.18652726709842682, "learning_rate": 2.646873011698024e-05, "loss": 0.0213, "num_input_tokens_seen": 23993312, "step": 7215 }, { "epoch": 1.4446139609333968, "grad_norm": 0.5600389838218689, "learning_rate": 2.6442578905126458e-05, "loss": 0.0098, "num_input_tokens_seen": 24008896, "step": 7220 }, { "epoch": 1.4456143861140984, "grad_norm": 0.423225075006485, "learning_rate": 2.6416426109407954e-05, "loss": 0.0568, "num_input_tokens_seen": 24025632, "step": 7225 }, { "epoch": 1.4466148112948003, "grad_norm": 0.2751964330673218, "learning_rate": 2.6390271758538944e-05, "loss": 0.0131, "num_input_tokens_seen": 24041792, "step": 7230 }, { "epoch": 1.4476152364755022, "grad_norm": 0.10048820078372955, "learning_rate": 2.6364115881235302e-05, "loss": 0.0283, "num_input_tokens_seen": 24058656, "step": 7235 }, { "epoch": 1.4486156616562038, "grad_norm": 0.5874961614608765, "learning_rate": 2.6337958506214627e-05, "loss": 0.0263, "num_input_tokens_seen": 24074048, "step": 7240 }, { "epoch": 1.4496160868369057, "grad_norm": 0.6436035633087158, "learning_rate": 2.6311799662196136e-05, "loss": 0.0228, "num_input_tokens_seen": 24090400, "step": 7245 }, { "epoch": 1.4506165120176076, "grad_norm": 0.2383117377758026, "learning_rate": 2.6285639377900657e-05, "loss": 0.0315, "num_input_tokens_seen": 24106432, "step": 7250 }, { "epoch": 1.4516169371983092, "grad_norm": 0.3349505066871643, "learning_rate": 2.6259477682050603e-05, "loss": 0.0175, "num_input_tokens_seen": 24123840, "step": 7255 }, { "epoch": 1.452617362379011, "grad_norm": 0.9861046671867371, "learning_rate": 2.6233314603369957e-05, "loss": 0.011, "num_input_tokens_seen": 24139360, "step": 7260 }, { "epoch": 1.453617787559713, "grad_norm": 0.1726267784833908, "learning_rate": 2.6207150170584187e-05, "loss": 0.0146, "num_input_tokens_seen": 24155520, "step": 7265 }, { "epoch": 1.4546182127404146, "grad_norm": 0.29923322796821594, "learning_rate": 2.6180984412420266e-05, "loss": 0.0227, "num_input_tokens_seen": 24172672, "step": 7270 }, { "epoch": 1.4556186379211165, "grad_norm": 0.3386686444282532, "learning_rate": 2.6154817357606626e-05, "loss": 0.0265, "num_input_tokens_seen": 24189536, "step": 7275 }, { "epoch": 1.4566190631018183, "grad_norm": 0.1898878961801529, "learning_rate": 2.6128649034873115e-05, "loss": 0.0283, "num_input_tokens_seen": 24204352, "step": 7280 }, { "epoch": 1.45761948828252, "grad_norm": 0.1861916184425354, "learning_rate": 2.610247947295097e-05, "loss": 0.0156, "num_input_tokens_seen": 24220832, "step": 7285 }, { "epoch": 1.4586199134632218, "grad_norm": 0.7051615715026855, "learning_rate": 2.6076308700572794e-05, "loss": 0.0252, "num_input_tokens_seen": 24238016, "step": 7290 }, { "epoch": 1.4596203386439237, "grad_norm": 0.3806222379207611, "learning_rate": 2.605013674647252e-05, "loss": 0.0309, "num_input_tokens_seen": 24253888, "step": 7295 }, { "epoch": 1.4606207638246254, "grad_norm": 0.6028633117675781, "learning_rate": 2.6023963639385374e-05, "loss": 0.021, "num_input_tokens_seen": 24270880, "step": 7300 }, { "epoch": 1.4616211890053272, "grad_norm": 0.2088484764099121, "learning_rate": 2.5997789408047863e-05, "loss": 0.0192, "num_input_tokens_seen": 24288256, "step": 7305 }, { "epoch": 1.462621614186029, "grad_norm": 0.35228636860847473, "learning_rate": 2.5971614081197692e-05, "loss": 0.0262, "num_input_tokens_seen": 24304704, "step": 7310 }, { "epoch": 1.4636220393667307, "grad_norm": 0.8489367961883545, "learning_rate": 2.5945437687573816e-05, "loss": 0.0157, "num_input_tokens_seen": 24321952, "step": 7315 }, { "epoch": 1.4646224645474326, "grad_norm": 1.9019261598587036, "learning_rate": 2.5919260255916322e-05, "loss": 0.0333, "num_input_tokens_seen": 24337696, "step": 7320 }, { "epoch": 1.4656228897281345, "grad_norm": 0.20332832634449005, "learning_rate": 2.589308181496645e-05, "loss": 0.0384, "num_input_tokens_seen": 24353696, "step": 7325 }, { "epoch": 1.4666233149088361, "grad_norm": 0.30129843950271606, "learning_rate": 2.5866902393466574e-05, "loss": 0.0131, "num_input_tokens_seen": 24370240, "step": 7330 }, { "epoch": 1.467623740089538, "grad_norm": 0.7240298986434937, "learning_rate": 2.5840722020160107e-05, "loss": 0.0251, "num_input_tokens_seen": 24387520, "step": 7335 }, { "epoch": 1.4686241652702399, "grad_norm": 1.0181207656860352, "learning_rate": 2.5814540723791518e-05, "loss": 0.0366, "num_input_tokens_seen": 24403840, "step": 7340 }, { "epoch": 1.4696245904509415, "grad_norm": 1.1026760339736938, "learning_rate": 2.5788358533106295e-05, "loss": 0.0377, "num_input_tokens_seen": 24419936, "step": 7345 }, { "epoch": 1.4706250156316434, "grad_norm": 0.7127203941345215, "learning_rate": 2.5762175476850896e-05, "loss": 0.0301, "num_input_tokens_seen": 24436736, "step": 7350 }, { "epoch": 1.4716254408123453, "grad_norm": 0.2946961224079132, "learning_rate": 2.573599158377276e-05, "loss": 0.0202, "num_input_tokens_seen": 24453120, "step": 7355 }, { "epoch": 1.4726258659930471, "grad_norm": 0.17568668723106384, "learning_rate": 2.570980688262022e-05, "loss": 0.0081, "num_input_tokens_seen": 24470368, "step": 7360 }, { "epoch": 1.4736262911737488, "grad_norm": 0.2808990478515625, "learning_rate": 2.568362140214248e-05, "loss": 0.0207, "num_input_tokens_seen": 24488224, "step": 7365 }, { "epoch": 1.4746267163544506, "grad_norm": 0.12591205537319183, "learning_rate": 2.5657435171089646e-05, "loss": 0.0135, "num_input_tokens_seen": 24504896, "step": 7370 }, { "epoch": 1.4756271415351525, "grad_norm": 0.5561877489089966, "learning_rate": 2.5631248218212595e-05, "loss": 0.0185, "num_input_tokens_seen": 24521888, "step": 7375 }, { "epoch": 1.4766275667158542, "grad_norm": 0.07315061241388321, "learning_rate": 2.5605060572263046e-05, "loss": 0.0234, "num_input_tokens_seen": 24538496, "step": 7380 }, { "epoch": 1.477627991896556, "grad_norm": 0.4523506760597229, "learning_rate": 2.5578872261993447e-05, "loss": 0.0196, "num_input_tokens_seen": 24554848, "step": 7385 }, { "epoch": 1.478628417077258, "grad_norm": 0.07306066155433655, "learning_rate": 2.5552683316156983e-05, "loss": 0.0244, "num_input_tokens_seen": 24571840, "step": 7390 }, { "epoch": 1.4796288422579598, "grad_norm": 0.3900500535964966, "learning_rate": 2.5526493763507548e-05, "loss": 0.0221, "num_input_tokens_seen": 24589280, "step": 7395 }, { "epoch": 1.4806292674386614, "grad_norm": 0.5769615173339844, "learning_rate": 2.5500303632799675e-05, "loss": 0.0157, "num_input_tokens_seen": 24605568, "step": 7400 }, { "epoch": 1.4816296926193633, "grad_norm": 0.06523500382900238, "learning_rate": 2.5474112952788565e-05, "loss": 0.0238, "num_input_tokens_seen": 24622304, "step": 7405 }, { "epoch": 1.4826301178000652, "grad_norm": 0.3345092535018921, "learning_rate": 2.5447921752230003e-05, "loss": 0.0438, "num_input_tokens_seen": 24639328, "step": 7410 }, { "epoch": 1.4836305429807668, "grad_norm": 0.5575299263000488, "learning_rate": 2.542173005988034e-05, "loss": 0.0136, "num_input_tokens_seen": 24657216, "step": 7415 }, { "epoch": 1.4846309681614687, "grad_norm": 0.6276900768280029, "learning_rate": 2.539553790449649e-05, "loss": 0.0379, "num_input_tokens_seen": 24673664, "step": 7420 }, { "epoch": 1.4856313933421705, "grad_norm": 0.5556184649467468, "learning_rate": 2.5369345314835856e-05, "loss": 0.0235, "num_input_tokens_seen": 24691264, "step": 7425 }, { "epoch": 1.4866318185228722, "grad_norm": 0.6906352043151855, "learning_rate": 2.5343152319656316e-05, "loss": 0.0217, "num_input_tokens_seen": 24708512, "step": 7430 }, { "epoch": 1.487632243703574, "grad_norm": 0.267528235912323, "learning_rate": 2.5316958947716206e-05, "loss": 0.0269, "num_input_tokens_seen": 24725152, "step": 7435 }, { "epoch": 1.488632668884276, "grad_norm": 0.313608318567276, "learning_rate": 2.5290765227774277e-05, "loss": 0.0158, "num_input_tokens_seen": 24741696, "step": 7440 }, { "epoch": 1.4896330940649776, "grad_norm": 0.8933184146881104, "learning_rate": 2.5264571188589642e-05, "loss": 0.0331, "num_input_tokens_seen": 24758432, "step": 7445 }, { "epoch": 1.4906335192456794, "grad_norm": 0.48685455322265625, "learning_rate": 2.5238376858921786e-05, "loss": 0.0248, "num_input_tokens_seen": 24774400, "step": 7450 }, { "epoch": 1.4916339444263813, "grad_norm": 0.806394636631012, "learning_rate": 2.5212182267530498e-05, "loss": 0.0207, "num_input_tokens_seen": 24790848, "step": 7455 }, { "epoch": 1.492634369607083, "grad_norm": 0.7549716234207153, "learning_rate": 2.5185987443175878e-05, "loss": 0.0213, "num_input_tokens_seen": 24808608, "step": 7460 }, { "epoch": 1.4936347947877848, "grad_norm": 0.014884378761053085, "learning_rate": 2.5159792414618244e-05, "loss": 0.0207, "num_input_tokens_seen": 24824928, "step": 7465 }, { "epoch": 1.4946352199684867, "grad_norm": 0.788593053817749, "learning_rate": 2.513359721061817e-05, "loss": 0.0177, "num_input_tokens_seen": 24840736, "step": 7470 }, { "epoch": 1.4956356451491883, "grad_norm": 0.3795950710773468, "learning_rate": 2.510740185993641e-05, "loss": 0.0223, "num_input_tokens_seen": 24858208, "step": 7475 }, { "epoch": 1.4966360703298902, "grad_norm": 1.174605369567871, "learning_rate": 2.5081206391333873e-05, "loss": 0.025, "num_input_tokens_seen": 24874720, "step": 7480 }, { "epoch": 1.497636495510592, "grad_norm": 0.6153615713119507, "learning_rate": 2.505501083357162e-05, "loss": 0.0195, "num_input_tokens_seen": 24893184, "step": 7485 }, { "epoch": 1.4986369206912937, "grad_norm": 0.3919779062271118, "learning_rate": 2.502881521541078e-05, "loss": 0.0318, "num_input_tokens_seen": 24910720, "step": 7490 }, { "epoch": 1.4996373458719956, "grad_norm": 0.2815072536468506, "learning_rate": 2.500261956561257e-05, "loss": 0.007, "num_input_tokens_seen": 24927392, "step": 7495 }, { "epoch": 1.5006377710526975, "grad_norm": 0.11717823147773743, "learning_rate": 2.4976423912938236e-05, "loss": 0.0199, "num_input_tokens_seen": 24944736, "step": 7500 }, { "epoch": 1.5016381962333991, "grad_norm": 1.0984041690826416, "learning_rate": 2.4950228286149028e-05, "loss": 0.047, "num_input_tokens_seen": 24960832, "step": 7505 }, { "epoch": 1.502638621414101, "grad_norm": 0.7648619413375854, "learning_rate": 2.4924032714006157e-05, "loss": 0.0319, "num_input_tokens_seen": 24978272, "step": 7510 }, { "epoch": 1.5036390465948029, "grad_norm": 0.40908828377723694, "learning_rate": 2.4897837225270796e-05, "loss": 0.0196, "num_input_tokens_seen": 24995904, "step": 7515 }, { "epoch": 1.5046394717755045, "grad_norm": 0.14489774405956268, "learning_rate": 2.4871641848704004e-05, "loss": 0.0096, "num_input_tokens_seen": 25013536, "step": 7520 }, { "epoch": 1.5056398969562064, "grad_norm": 0.4525126516819, "learning_rate": 2.484544661306672e-05, "loss": 0.0408, "num_input_tokens_seen": 25030560, "step": 7525 }, { "epoch": 1.5066403221369082, "grad_norm": 0.21470829844474792, "learning_rate": 2.481925154711975e-05, "loss": 0.0153, "num_input_tokens_seen": 25046880, "step": 7530 }, { "epoch": 1.5076407473176099, "grad_norm": 0.30370137095451355, "learning_rate": 2.479305667962369e-05, "loss": 0.0301, "num_input_tokens_seen": 25064384, "step": 7535 }, { "epoch": 1.5086411724983118, "grad_norm": 0.14242781698703766, "learning_rate": 2.476686203933892e-05, "loss": 0.0163, "num_input_tokens_seen": 25079808, "step": 7540 }, { "epoch": 1.5096415976790136, "grad_norm": 0.3540135324001312, "learning_rate": 2.4740667655025586e-05, "loss": 0.036, "num_input_tokens_seen": 25096128, "step": 7545 }, { "epoch": 1.5106420228597153, "grad_norm": 0.38274118304252625, "learning_rate": 2.4714473555443534e-05, "loss": 0.0146, "num_input_tokens_seen": 25112128, "step": 7550 }, { "epoch": 1.5116424480404171, "grad_norm": 0.8952894806861877, "learning_rate": 2.4688279769352323e-05, "loss": 0.0207, "num_input_tokens_seen": 25129440, "step": 7555 }, { "epoch": 1.512642873221119, "grad_norm": 0.40251901745796204, "learning_rate": 2.466208632551114e-05, "loss": 0.0231, "num_input_tokens_seen": 25144928, "step": 7560 }, { "epoch": 1.5136432984018207, "grad_norm": 1.6884913444519043, "learning_rate": 2.463589325267881e-05, "loss": 0.0253, "num_input_tokens_seen": 25161024, "step": 7565 }, { "epoch": 1.5146437235825225, "grad_norm": 0.24734361469745636, "learning_rate": 2.4609700579613745e-05, "loss": 0.024, "num_input_tokens_seen": 25177408, "step": 7570 }, { "epoch": 1.5156441487632244, "grad_norm": 0.8871567845344543, "learning_rate": 2.4583508335073933e-05, "loss": 0.0152, "num_input_tokens_seen": 25193632, "step": 7575 }, { "epoch": 1.516644573943926, "grad_norm": 0.6303677558898926, "learning_rate": 2.4557316547816865e-05, "loss": 0.0129, "num_input_tokens_seen": 25211136, "step": 7580 }, { "epoch": 1.517644999124628, "grad_norm": 0.1167726218700409, "learning_rate": 2.4531125246599556e-05, "loss": 0.0211, "num_input_tokens_seen": 25228416, "step": 7585 }, { "epoch": 1.5186454243053298, "grad_norm": 0.3821775019168854, "learning_rate": 2.4504934460178475e-05, "loss": 0.0067, "num_input_tokens_seen": 25244640, "step": 7590 }, { "epoch": 1.5196458494860314, "grad_norm": 0.09346763789653778, "learning_rate": 2.4478744217309515e-05, "loss": 0.0166, "num_input_tokens_seen": 25262528, "step": 7595 }, { "epoch": 1.5206462746667335, "grad_norm": 1.1715312004089355, "learning_rate": 2.445255454674801e-05, "loss": 0.0289, "num_input_tokens_seen": 25278720, "step": 7600 }, { "epoch": 1.5216466998474352, "grad_norm": 0.7903635501861572, "learning_rate": 2.4426365477248613e-05, "loss": 0.036, "num_input_tokens_seen": 25295616, "step": 7605 }, { "epoch": 1.5226471250281368, "grad_norm": 0.16973412036895752, "learning_rate": 2.4400177037565363e-05, "loss": 0.0139, "num_input_tokens_seen": 25312384, "step": 7610 }, { "epoch": 1.523647550208839, "grad_norm": 0.07989294826984406, "learning_rate": 2.437398925645159e-05, "loss": 0.0108, "num_input_tokens_seen": 25329056, "step": 7615 }, { "epoch": 1.5246479753895406, "grad_norm": 0.18625207245349884, "learning_rate": 2.4347802162659877e-05, "loss": 0.0181, "num_input_tokens_seen": 25345664, "step": 7620 }, { "epoch": 1.5256484005702422, "grad_norm": 0.7733270525932312, "learning_rate": 2.43216157849421e-05, "loss": 0.0477, "num_input_tokens_seen": 25361568, "step": 7625 }, { "epoch": 1.5266488257509443, "grad_norm": 0.5695168972015381, "learning_rate": 2.4295430152049316e-05, "loss": 0.0213, "num_input_tokens_seen": 25378496, "step": 7630 }, { "epoch": 1.527649250931646, "grad_norm": 1.1541272401809692, "learning_rate": 2.426924529273176e-05, "loss": 0.0317, "num_input_tokens_seen": 25394240, "step": 7635 }, { "epoch": 1.5286496761123476, "grad_norm": 0.38781797885894775, "learning_rate": 2.424306123573885e-05, "loss": 0.0233, "num_input_tokens_seen": 25410656, "step": 7640 }, { "epoch": 1.5296501012930497, "grad_norm": 0.1999148726463318, "learning_rate": 2.421687800981908e-05, "loss": 0.0195, "num_input_tokens_seen": 25427328, "step": 7645 }, { "epoch": 1.5306505264737513, "grad_norm": 0.06788372248411179, "learning_rate": 2.4190695643720074e-05, "loss": 0.0162, "num_input_tokens_seen": 25443840, "step": 7650 }, { "epoch": 1.5316509516544532, "grad_norm": 0.21277567744255066, "learning_rate": 2.416451416618849e-05, "loss": 0.0164, "num_input_tokens_seen": 25461216, "step": 7655 }, { "epoch": 1.532651376835155, "grad_norm": 0.23967178165912628, "learning_rate": 2.4138333605970002e-05, "loss": 0.0239, "num_input_tokens_seen": 25478080, "step": 7660 }, { "epoch": 1.5336518020158567, "grad_norm": 0.45840054750442505, "learning_rate": 2.4112153991809297e-05, "loss": 0.0173, "num_input_tokens_seen": 25494528, "step": 7665 }, { "epoch": 1.5346522271965586, "grad_norm": 0.1507226526737213, "learning_rate": 2.408597535245001e-05, "loss": 0.0105, "num_input_tokens_seen": 25511904, "step": 7670 }, { "epoch": 1.5356526523772605, "grad_norm": 0.5378280878067017, "learning_rate": 2.4059797716634707e-05, "loss": 0.0157, "num_input_tokens_seen": 25529120, "step": 7675 }, { "epoch": 1.536653077557962, "grad_norm": 0.6452955603599548, "learning_rate": 2.403362111310486e-05, "loss": 0.0261, "num_input_tokens_seen": 25546336, "step": 7680 }, { "epoch": 1.537653502738664, "grad_norm": 0.24737097322940826, "learning_rate": 2.40074455706008e-05, "loss": 0.0114, "num_input_tokens_seen": 25562976, "step": 7685 }, { "epoch": 1.5386539279193658, "grad_norm": 0.21651674807071686, "learning_rate": 2.398127111786169e-05, "loss": 0.0212, "num_input_tokens_seen": 25579808, "step": 7690 }, { "epoch": 1.5396543531000675, "grad_norm": 0.4780050218105316, "learning_rate": 2.395509778362552e-05, "loss": 0.0358, "num_input_tokens_seen": 25595712, "step": 7695 }, { "epoch": 1.5406547782807694, "grad_norm": 0.3401283919811249, "learning_rate": 2.3928925596629004e-05, "loss": 0.0177, "num_input_tokens_seen": 25612384, "step": 7700 }, { "epoch": 1.5416552034614712, "grad_norm": 0.3908470571041107, "learning_rate": 2.3902754585607655e-05, "loss": 0.0117, "num_input_tokens_seen": 25628704, "step": 7705 }, { "epoch": 1.5426556286421729, "grad_norm": 0.42244410514831543, "learning_rate": 2.387658477929565e-05, "loss": 0.0157, "num_input_tokens_seen": 25645024, "step": 7710 }, { "epoch": 1.5436560538228747, "grad_norm": 0.17085978388786316, "learning_rate": 2.3850416206425868e-05, "loss": 0.0144, "num_input_tokens_seen": 25661952, "step": 7715 }, { "epoch": 1.5446564790035766, "grad_norm": 0.4565860629081726, "learning_rate": 2.3824248895729823e-05, "loss": 0.0134, "num_input_tokens_seen": 25679520, "step": 7720 }, { "epoch": 1.5456569041842783, "grad_norm": 0.8869709968566895, "learning_rate": 2.379808287593764e-05, "loss": 0.0157, "num_input_tokens_seen": 25695968, "step": 7725 }, { "epoch": 1.5466573293649801, "grad_norm": 0.08044911921024323, "learning_rate": 2.3771918175778037e-05, "loss": 0.0112, "num_input_tokens_seen": 25711168, "step": 7730 }, { "epoch": 1.547657754545682, "grad_norm": 0.0679691955447197, "learning_rate": 2.374575482397828e-05, "loss": 0.0208, "num_input_tokens_seen": 25726784, "step": 7735 }, { "epoch": 1.5486581797263836, "grad_norm": 0.13976770639419556, "learning_rate": 2.3719592849264144e-05, "loss": 0.0178, "num_input_tokens_seen": 25744064, "step": 7740 }, { "epoch": 1.5496586049070855, "grad_norm": 0.4790512025356293, "learning_rate": 2.3693432280359906e-05, "loss": 0.0328, "num_input_tokens_seen": 25761632, "step": 7745 }, { "epoch": 1.5506590300877874, "grad_norm": 0.499796986579895, "learning_rate": 2.366727314598829e-05, "loss": 0.0483, "num_input_tokens_seen": 25777632, "step": 7750 }, { "epoch": 1.551659455268489, "grad_norm": 0.5286955237388611, "learning_rate": 2.3641115474870462e-05, "loss": 0.0174, "num_input_tokens_seen": 25795264, "step": 7755 }, { "epoch": 1.552659880449191, "grad_norm": 0.3542931377887726, "learning_rate": 2.361495929572596e-05, "loss": 0.023, "num_input_tokens_seen": 25811680, "step": 7760 }, { "epoch": 1.5536603056298928, "grad_norm": 0.2267359495162964, "learning_rate": 2.358880463727269e-05, "loss": 0.0151, "num_input_tokens_seen": 25827328, "step": 7765 }, { "epoch": 1.5546607308105944, "grad_norm": 0.5924873352050781, "learning_rate": 2.35626515282269e-05, "loss": 0.0206, "num_input_tokens_seen": 25843520, "step": 7770 }, { "epoch": 1.5556611559912963, "grad_norm": 0.0663023293018341, "learning_rate": 2.3536499997303126e-05, "loss": 0.0262, "num_input_tokens_seen": 25859040, "step": 7775 }, { "epoch": 1.5566615811719982, "grad_norm": 0.5205705761909485, "learning_rate": 2.3510350073214165e-05, "loss": 0.0318, "num_input_tokens_seen": 25875008, "step": 7780 }, { "epoch": 1.5576620063526998, "grad_norm": 0.3358594477176666, "learning_rate": 2.3484201784671055e-05, "loss": 0.0326, "num_input_tokens_seen": 25892064, "step": 7785 }, { "epoch": 1.5586624315334017, "grad_norm": 0.2526882290840149, "learning_rate": 2.3458055160383054e-05, "loss": 0.0165, "num_input_tokens_seen": 25908896, "step": 7790 }, { "epoch": 1.5596628567141035, "grad_norm": 0.286253958940506, "learning_rate": 2.343191022905758e-05, "loss": 0.0251, "num_input_tokens_seen": 25926816, "step": 7795 }, { "epoch": 1.5606632818948052, "grad_norm": 0.47777873277664185, "learning_rate": 2.340576701940017e-05, "loss": 0.0236, "num_input_tokens_seen": 25943296, "step": 7800 }, { "epoch": 1.561663707075507, "grad_norm": 0.5363597273826599, "learning_rate": 2.3379625560114516e-05, "loss": 0.0281, "num_input_tokens_seen": 25958496, "step": 7805 }, { "epoch": 1.562664132256209, "grad_norm": 0.8436539173126221, "learning_rate": 2.3353485879902367e-05, "loss": 0.0242, "num_input_tokens_seen": 25974336, "step": 7810 }, { "epoch": 1.5636645574369106, "grad_norm": 0.13085699081420898, "learning_rate": 2.3327348007463506e-05, "loss": 0.0112, "num_input_tokens_seen": 25989792, "step": 7815 }, { "epoch": 1.5646649826176124, "grad_norm": 0.12430788576602936, "learning_rate": 2.330121197149574e-05, "loss": 0.0246, "num_input_tokens_seen": 26006080, "step": 7820 }, { "epoch": 1.5656654077983143, "grad_norm": 0.13795477151870728, "learning_rate": 2.3275077800694872e-05, "loss": 0.0191, "num_input_tokens_seen": 26022496, "step": 7825 }, { "epoch": 1.566665832979016, "grad_norm": 0.36709097027778625, "learning_rate": 2.324894552375464e-05, "loss": 0.0209, "num_input_tokens_seen": 26039136, "step": 7830 }, { "epoch": 1.5676662581597178, "grad_norm": 0.6450609564781189, "learning_rate": 2.3222815169366706e-05, "loss": 0.0191, "num_input_tokens_seen": 26054624, "step": 7835 }, { "epoch": 1.5686666833404197, "grad_norm": 0.46975892782211304, "learning_rate": 2.319668676622063e-05, "loss": 0.0251, "num_input_tokens_seen": 26070720, "step": 7840 }, { "epoch": 1.5696671085211213, "grad_norm": 0.4092192053794861, "learning_rate": 2.3170560343003826e-05, "loss": 0.0163, "num_input_tokens_seen": 26087552, "step": 7845 }, { "epoch": 1.5706675337018234, "grad_norm": 0.4573599398136139, "learning_rate": 2.3144435928401528e-05, "loss": 0.026, "num_input_tokens_seen": 26104224, "step": 7850 }, { "epoch": 1.571667958882525, "grad_norm": 0.5140572190284729, "learning_rate": 2.3118313551096763e-05, "loss": 0.0376, "num_input_tokens_seen": 26119712, "step": 7855 }, { "epoch": 1.5726683840632267, "grad_norm": 0.6111047863960266, "learning_rate": 2.3092193239770338e-05, "loss": 0.0205, "num_input_tokens_seen": 26136448, "step": 7860 }, { "epoch": 1.5736688092439288, "grad_norm": 0.6026287078857422, "learning_rate": 2.3066075023100785e-05, "loss": 0.0127, "num_input_tokens_seen": 26152896, "step": 7865 }, { "epoch": 1.5746692344246305, "grad_norm": 0.5394666790962219, "learning_rate": 2.303995892976432e-05, "loss": 0.0282, "num_input_tokens_seen": 26169536, "step": 7870 }, { "epoch": 1.5756696596053321, "grad_norm": 0.20104095339775085, "learning_rate": 2.3013844988434842e-05, "loss": 0.0239, "num_input_tokens_seen": 26186304, "step": 7875 }, { "epoch": 1.5766700847860342, "grad_norm": 0.17194046080112457, "learning_rate": 2.298773322778389e-05, "loss": 0.0413, "num_input_tokens_seen": 26203616, "step": 7880 }, { "epoch": 1.5776705099667359, "grad_norm": 0.5429868102073669, "learning_rate": 2.296162367648061e-05, "loss": 0.0224, "num_input_tokens_seen": 26220736, "step": 7885 }, { "epoch": 1.5786709351474375, "grad_norm": 0.4686752259731293, "learning_rate": 2.2935516363191693e-05, "loss": 0.0352, "num_input_tokens_seen": 26237376, "step": 7890 }, { "epoch": 1.5796713603281396, "grad_norm": 0.4157911539077759, "learning_rate": 2.2909411316581418e-05, "loss": 0.0154, "num_input_tokens_seen": 26253664, "step": 7895 }, { "epoch": 1.5806717855088412, "grad_norm": 0.17762956023216248, "learning_rate": 2.288330856531155e-05, "loss": 0.0164, "num_input_tokens_seen": 26270240, "step": 7900 }, { "epoch": 1.581672210689543, "grad_norm": 0.3104545474052429, "learning_rate": 2.2857208138041316e-05, "loss": 0.0224, "num_input_tokens_seen": 26286400, "step": 7905 }, { "epoch": 1.582672635870245, "grad_norm": 0.2843524217605591, "learning_rate": 2.2831110063427443e-05, "loss": 0.0192, "num_input_tokens_seen": 26303328, "step": 7910 }, { "epoch": 1.5836730610509466, "grad_norm": 0.7966655492782593, "learning_rate": 2.2805014370124024e-05, "loss": 0.0229, "num_input_tokens_seen": 26319456, "step": 7915 }, { "epoch": 1.5846734862316485, "grad_norm": 0.5430777072906494, "learning_rate": 2.277892108678257e-05, "loss": 0.0111, "num_input_tokens_seen": 26335808, "step": 7920 }, { "epoch": 1.5856739114123504, "grad_norm": 0.20004427433013916, "learning_rate": 2.275283024205193e-05, "loss": 0.0166, "num_input_tokens_seen": 26354496, "step": 7925 }, { "epoch": 1.586674336593052, "grad_norm": 0.35423436760902405, "learning_rate": 2.2726741864578265e-05, "loss": 0.009, "num_input_tokens_seen": 26372160, "step": 7930 }, { "epoch": 1.5876747617737539, "grad_norm": 0.5483704805374146, "learning_rate": 2.2700655983005064e-05, "loss": 0.0283, "num_input_tokens_seen": 26388544, "step": 7935 }, { "epoch": 1.5886751869544558, "grad_norm": 0.6132058501243591, "learning_rate": 2.2674572625973047e-05, "loss": 0.0208, "num_input_tokens_seen": 26404192, "step": 7940 }, { "epoch": 1.5896756121351574, "grad_norm": 1.8248412609100342, "learning_rate": 2.264849182212015e-05, "loss": 0.0302, "num_input_tokens_seen": 26421152, "step": 7945 }, { "epoch": 1.5906760373158593, "grad_norm": 0.5395341515541077, "learning_rate": 2.262241360008155e-05, "loss": 0.0344, "num_input_tokens_seen": 26436992, "step": 7950 }, { "epoch": 1.5916764624965611, "grad_norm": 0.19765163958072662, "learning_rate": 2.2596337988489537e-05, "loss": 0.0122, "num_input_tokens_seen": 26453248, "step": 7955 }, { "epoch": 1.5926768876772628, "grad_norm": 0.23685020208358765, "learning_rate": 2.257026501597359e-05, "loss": 0.0223, "num_input_tokens_seen": 26469280, "step": 7960 }, { "epoch": 1.5936773128579647, "grad_norm": 0.15128207206726074, "learning_rate": 2.2544194711160243e-05, "loss": 0.0077, "num_input_tokens_seen": 26485088, "step": 7965 }, { "epoch": 1.5946777380386665, "grad_norm": 0.14374390244483948, "learning_rate": 2.2518127102673123e-05, "loss": 0.0092, "num_input_tokens_seen": 26500992, "step": 7970 }, { "epoch": 1.5956781632193682, "grad_norm": 0.05345458537340164, "learning_rate": 2.24920622191329e-05, "loss": 0.011, "num_input_tokens_seen": 26516864, "step": 7975 }, { "epoch": 1.59667858840007, "grad_norm": 0.22516244649887085, "learning_rate": 2.246600008915724e-05, "loss": 0.033, "num_input_tokens_seen": 26534144, "step": 7980 }, { "epoch": 1.597679013580772, "grad_norm": 0.5458850860595703, "learning_rate": 2.2439940741360784e-05, "loss": 0.0175, "num_input_tokens_seen": 26551328, "step": 7985 }, { "epoch": 1.5986794387614736, "grad_norm": 0.33864474296569824, "learning_rate": 2.2413884204355144e-05, "loss": 0.0182, "num_input_tokens_seen": 26567616, "step": 7990 }, { "epoch": 1.5996798639421754, "grad_norm": 0.9156394004821777, "learning_rate": 2.2387830506748812e-05, "loss": 0.0269, "num_input_tokens_seen": 26585824, "step": 7995 }, { "epoch": 1.6006802891228773, "grad_norm": 0.8333908915519714, "learning_rate": 2.236177967714718e-05, "loss": 0.025, "num_input_tokens_seen": 26602208, "step": 8000 }, { "epoch": 1.601680714303579, "grad_norm": 0.16611388325691223, "learning_rate": 2.2335731744152497e-05, "loss": 0.0041, "num_input_tokens_seen": 26618432, "step": 8005 }, { "epoch": 1.6026811394842808, "grad_norm": 0.17586727440357208, "learning_rate": 2.230968673636381e-05, "loss": 0.0063, "num_input_tokens_seen": 26634368, "step": 8010 }, { "epoch": 1.6036815646649827, "grad_norm": 1.160220742225647, "learning_rate": 2.2283644682376984e-05, "loss": 0.0338, "num_input_tokens_seen": 26650976, "step": 8015 }, { "epoch": 1.6046819898456843, "grad_norm": 0.23919957876205444, "learning_rate": 2.2257605610784622e-05, "loss": 0.0182, "num_input_tokens_seen": 26668320, "step": 8020 }, { "epoch": 1.6056824150263862, "grad_norm": 0.24601928889751434, "learning_rate": 2.2231569550176042e-05, "loss": 0.0229, "num_input_tokens_seen": 26686400, "step": 8025 }, { "epoch": 1.606682840207088, "grad_norm": 0.7848149538040161, "learning_rate": 2.220553652913729e-05, "loss": 0.0239, "num_input_tokens_seen": 26702688, "step": 8030 }, { "epoch": 1.6076832653877897, "grad_norm": 0.1857006996870041, "learning_rate": 2.217950657625104e-05, "loss": 0.0397, "num_input_tokens_seen": 26719040, "step": 8035 }, { "epoch": 1.6086836905684916, "grad_norm": 0.1828911006450653, "learning_rate": 2.2153479720096613e-05, "loss": 0.0144, "num_input_tokens_seen": 26735168, "step": 8040 }, { "epoch": 1.6096841157491935, "grad_norm": 0.3723798096179962, "learning_rate": 2.2127455989249933e-05, "loss": 0.0197, "num_input_tokens_seen": 26753280, "step": 8045 }, { "epoch": 1.610684540929895, "grad_norm": 0.9142296314239502, "learning_rate": 2.2101435412283493e-05, "loss": 0.0357, "num_input_tokens_seen": 26769472, "step": 8050 }, { "epoch": 1.611684966110597, "grad_norm": 0.45003557205200195, "learning_rate": 2.20754180177663e-05, "loss": 0.0123, "num_input_tokens_seen": 26785920, "step": 8055 }, { "epoch": 1.6126853912912988, "grad_norm": 0.26906803250312805, "learning_rate": 2.2049403834263896e-05, "loss": 0.0332, "num_input_tokens_seen": 26802496, "step": 8060 }, { "epoch": 1.6136858164720005, "grad_norm": 0.5820974707603455, "learning_rate": 2.20233928903383e-05, "loss": 0.012, "num_input_tokens_seen": 26820320, "step": 8065 }, { "epoch": 1.6146862416527024, "grad_norm": 0.3259919285774231, "learning_rate": 2.199738521454795e-05, "loss": 0.0192, "num_input_tokens_seen": 26837664, "step": 8070 }, { "epoch": 1.6156866668334042, "grad_norm": 0.9476006031036377, "learning_rate": 2.1971380835447708e-05, "loss": 0.017, "num_input_tokens_seen": 26853248, "step": 8075 }, { "epoch": 1.6166870920141059, "grad_norm": 0.7303922772407532, "learning_rate": 2.1945379781588808e-05, "loss": 0.0207, "num_input_tokens_seen": 26869440, "step": 8080 }, { "epoch": 1.6176875171948077, "grad_norm": 0.12725351750850677, "learning_rate": 2.191938208151885e-05, "loss": 0.0167, "num_input_tokens_seen": 26886720, "step": 8085 }, { "epoch": 1.6186879423755096, "grad_norm": 0.18985620141029358, "learning_rate": 2.1893387763781742e-05, "loss": 0.0162, "num_input_tokens_seen": 26902944, "step": 8090 }, { "epoch": 1.6196883675562113, "grad_norm": 1.6756082773208618, "learning_rate": 2.1867396856917665e-05, "loss": 0.0236, "num_input_tokens_seen": 26919616, "step": 8095 }, { "epoch": 1.6206887927369134, "grad_norm": 0.510668158531189, "learning_rate": 2.1841409389463088e-05, "loss": 0.0125, "num_input_tokens_seen": 26934848, "step": 8100 }, { "epoch": 1.621689217917615, "grad_norm": 0.5919009447097778, "learning_rate": 2.1815425389950675e-05, "loss": 0.013, "num_input_tokens_seen": 26951136, "step": 8105 }, { "epoch": 1.6226896430983166, "grad_norm": 0.7118249535560608, "learning_rate": 2.178944488690928e-05, "loss": 0.0177, "num_input_tokens_seen": 26967424, "step": 8110 }, { "epoch": 1.6236900682790187, "grad_norm": 1.4784985780715942, "learning_rate": 2.176346790886395e-05, "loss": 0.0392, "num_input_tokens_seen": 26985984, "step": 8115 }, { "epoch": 1.6246904934597204, "grad_norm": 0.253864586353302, "learning_rate": 2.1737494484335835e-05, "loss": 0.0214, "num_input_tokens_seen": 27002016, "step": 8120 }, { "epoch": 1.625690918640422, "grad_norm": 0.6842542886734009, "learning_rate": 2.171152464184219e-05, "loss": 0.0128, "num_input_tokens_seen": 27018848, "step": 8125 }, { "epoch": 1.6266913438211241, "grad_norm": 0.03511577472090721, "learning_rate": 2.1685558409896336e-05, "loss": 0.0339, "num_input_tokens_seen": 27034848, "step": 8130 }, { "epoch": 1.6276917690018258, "grad_norm": 0.21839222311973572, "learning_rate": 2.1659595817007623e-05, "loss": 0.0134, "num_input_tokens_seen": 27052608, "step": 8135 }, { "epoch": 1.6286921941825274, "grad_norm": 0.5350809693336487, "learning_rate": 2.163363689168143e-05, "loss": 0.0233, "num_input_tokens_seen": 27068512, "step": 8140 }, { "epoch": 1.6296926193632295, "grad_norm": 0.44204947352409363, "learning_rate": 2.1607681662419085e-05, "loss": 0.0143, "num_input_tokens_seen": 27084256, "step": 8145 }, { "epoch": 1.6306930445439312, "grad_norm": 0.2233804613351822, "learning_rate": 2.1581730157717856e-05, "loss": 0.0148, "num_input_tokens_seen": 27100448, "step": 8150 }, { "epoch": 1.6316934697246328, "grad_norm": 0.16001921892166138, "learning_rate": 2.1555782406070948e-05, "loss": 0.02, "num_input_tokens_seen": 27117600, "step": 8155 }, { "epoch": 1.632693894905335, "grad_norm": 0.18442663550376892, "learning_rate": 2.152983843596741e-05, "loss": 0.0234, "num_input_tokens_seen": 27133248, "step": 8160 }, { "epoch": 1.6336943200860365, "grad_norm": 0.1674097180366516, "learning_rate": 2.1503898275892177e-05, "loss": 0.0084, "num_input_tokens_seen": 27150272, "step": 8165 }, { "epoch": 1.6346947452667384, "grad_norm": 0.4220287501811981, "learning_rate": 2.147796195432597e-05, "loss": 0.0106, "num_input_tokens_seen": 27167296, "step": 8170 }, { "epoch": 1.6356951704474403, "grad_norm": 0.1048525869846344, "learning_rate": 2.145202949974532e-05, "loss": 0.0117, "num_input_tokens_seen": 27184672, "step": 8175 }, { "epoch": 1.636695595628142, "grad_norm": 0.7683660387992859, "learning_rate": 2.1426100940622483e-05, "loss": 0.0221, "num_input_tokens_seen": 27200768, "step": 8180 }, { "epoch": 1.6376960208088438, "grad_norm": 0.10124363005161285, "learning_rate": 2.140017630542546e-05, "loss": 0.0246, "num_input_tokens_seen": 27216992, "step": 8185 }, { "epoch": 1.6386964459895457, "grad_norm": 0.5351779460906982, "learning_rate": 2.137425562261795e-05, "loss": 0.0229, "num_input_tokens_seen": 27233184, "step": 8190 }, { "epoch": 1.6396968711702473, "grad_norm": 0.08957615494728088, "learning_rate": 2.1348338920659285e-05, "loss": 0.0223, "num_input_tokens_seen": 27248928, "step": 8195 }, { "epoch": 1.6406972963509492, "grad_norm": 0.12967830896377563, "learning_rate": 2.1322426228004453e-05, "loss": 0.0236, "num_input_tokens_seen": 27265536, "step": 8200 }, { "epoch": 1.641697721531651, "grad_norm": 0.0696130022406578, "learning_rate": 2.1296517573104015e-05, "loss": 0.0145, "num_input_tokens_seen": 27282112, "step": 8205 }, { "epoch": 1.6426981467123527, "grad_norm": 1.087754249572754, "learning_rate": 2.127061298440413e-05, "loss": 0.0291, "num_input_tokens_seen": 27298592, "step": 8210 }, { "epoch": 1.6436985718930546, "grad_norm": 0.2898842394351959, "learning_rate": 2.1244712490346455e-05, "loss": 0.0129, "num_input_tokens_seen": 27315008, "step": 8215 }, { "epoch": 1.6446989970737564, "grad_norm": 0.8593503832817078, "learning_rate": 2.1218816119368194e-05, "loss": 0.0198, "num_input_tokens_seen": 27331776, "step": 8220 }, { "epoch": 1.645699422254458, "grad_norm": 0.2977852523326874, "learning_rate": 2.119292389990199e-05, "loss": 0.0149, "num_input_tokens_seen": 27347712, "step": 8225 }, { "epoch": 1.64669984743516, "grad_norm": 0.6195009350776672, "learning_rate": 2.1167035860375947e-05, "loss": 0.0225, "num_input_tokens_seen": 27364992, "step": 8230 }, { "epoch": 1.6477002726158618, "grad_norm": 0.445385605096817, "learning_rate": 2.114115202921357e-05, "loss": 0.0193, "num_input_tokens_seen": 27380672, "step": 8235 }, { "epoch": 1.6487006977965635, "grad_norm": 0.402042418718338, "learning_rate": 2.1115272434833733e-05, "loss": 0.0113, "num_input_tokens_seen": 27397888, "step": 8240 }, { "epoch": 1.6497011229772653, "grad_norm": 0.3347969651222229, "learning_rate": 2.108939710565069e-05, "loss": 0.0363, "num_input_tokens_seen": 27415520, "step": 8245 }, { "epoch": 1.6507015481579672, "grad_norm": 0.5899851322174072, "learning_rate": 2.1063526070073986e-05, "loss": 0.031, "num_input_tokens_seen": 27432192, "step": 8250 }, { "epoch": 1.6517019733386689, "grad_norm": 0.4121209681034088, "learning_rate": 2.1037659356508457e-05, "loss": 0.0322, "num_input_tokens_seen": 27447712, "step": 8255 }, { "epoch": 1.6527023985193707, "grad_norm": 0.3264901041984558, "learning_rate": 2.1011796993354192e-05, "loss": 0.0208, "num_input_tokens_seen": 27464288, "step": 8260 }, { "epoch": 1.6537028237000726, "grad_norm": 0.5783780813217163, "learning_rate": 2.0985939009006507e-05, "loss": 0.0328, "num_input_tokens_seen": 27480480, "step": 8265 }, { "epoch": 1.6547032488807742, "grad_norm": 0.177225723862648, "learning_rate": 2.0960085431855925e-05, "loss": 0.0116, "num_input_tokens_seen": 27496800, "step": 8270 }, { "epoch": 1.6557036740614761, "grad_norm": 0.3714253306388855, "learning_rate": 2.093423629028811e-05, "loss": 0.0274, "num_input_tokens_seen": 27513600, "step": 8275 }, { "epoch": 1.656704099242178, "grad_norm": 0.38959836959838867, "learning_rate": 2.0908391612683854e-05, "loss": 0.0197, "num_input_tokens_seen": 27529376, "step": 8280 }, { "epoch": 1.6577045244228796, "grad_norm": 0.29594042897224426, "learning_rate": 2.088255142741906e-05, "loss": 0.0079, "num_input_tokens_seen": 27545568, "step": 8285 }, { "epoch": 1.6587049496035815, "grad_norm": 0.9828856587409973, "learning_rate": 2.08567157628647e-05, "loss": 0.0287, "num_input_tokens_seen": 27561536, "step": 8290 }, { "epoch": 1.6597053747842834, "grad_norm": 0.7645307779312134, "learning_rate": 2.0830884647386767e-05, "loss": 0.0311, "num_input_tokens_seen": 27578464, "step": 8295 }, { "epoch": 1.660705799964985, "grad_norm": 0.29915305972099304, "learning_rate": 2.080505810934628e-05, "loss": 0.0192, "num_input_tokens_seen": 27596192, "step": 8300 }, { "epoch": 1.6617062251456869, "grad_norm": 0.6114938259124756, "learning_rate": 2.077923617709921e-05, "loss": 0.018, "num_input_tokens_seen": 27612288, "step": 8305 }, { "epoch": 1.6627066503263888, "grad_norm": 0.9057605862617493, "learning_rate": 2.075341887899649e-05, "loss": 0.0204, "num_input_tokens_seen": 27628736, "step": 8310 }, { "epoch": 1.6637070755070904, "grad_norm": 0.3348783254623413, "learning_rate": 2.072760624338395e-05, "loss": 0.0142, "num_input_tokens_seen": 27644608, "step": 8315 }, { "epoch": 1.6647075006877923, "grad_norm": 0.11718225479125977, "learning_rate": 2.070179829860232e-05, "loss": 0.0136, "num_input_tokens_seen": 27661344, "step": 8320 }, { "epoch": 1.6657079258684941, "grad_norm": 0.1801997274160385, "learning_rate": 2.0675995072987165e-05, "loss": 0.033, "num_input_tokens_seen": 27678368, "step": 8325 }, { "epoch": 1.6667083510491958, "grad_norm": 1.089145541191101, "learning_rate": 2.065019659486887e-05, "loss": 0.0174, "num_input_tokens_seen": 27696480, "step": 8330 }, { "epoch": 1.6677087762298977, "grad_norm": 0.39637240767478943, "learning_rate": 2.0624402892572607e-05, "loss": 0.0153, "num_input_tokens_seen": 27712480, "step": 8335 }, { "epoch": 1.6687092014105995, "grad_norm": 1.060081958770752, "learning_rate": 2.0598613994418308e-05, "loss": 0.0222, "num_input_tokens_seen": 27729376, "step": 8340 }, { "epoch": 1.6697096265913012, "grad_norm": 0.2965173125267029, "learning_rate": 2.0572829928720635e-05, "loss": 0.0108, "num_input_tokens_seen": 27746560, "step": 8345 }, { "epoch": 1.670710051772003, "grad_norm": 0.1402323842048645, "learning_rate": 2.054705072378893e-05, "loss": 0.0116, "num_input_tokens_seen": 27764896, "step": 8350 }, { "epoch": 1.671710476952705, "grad_norm": 0.20911061763763428, "learning_rate": 2.0521276407927212e-05, "loss": 0.0231, "num_input_tokens_seen": 27782336, "step": 8355 }, { "epoch": 1.6727109021334066, "grad_norm": 0.0908847227692604, "learning_rate": 2.0495507009434127e-05, "loss": 0.0123, "num_input_tokens_seen": 27798272, "step": 8360 }, { "epoch": 1.6737113273141087, "grad_norm": 0.2123504877090454, "learning_rate": 2.046974255660291e-05, "loss": 0.0115, "num_input_tokens_seen": 27814528, "step": 8365 }, { "epoch": 1.6747117524948103, "grad_norm": 1.0239791870117188, "learning_rate": 2.044398307772139e-05, "loss": 0.0282, "num_input_tokens_seen": 27830080, "step": 8370 }, { "epoch": 1.675712177675512, "grad_norm": 0.0846494659781456, "learning_rate": 2.0418228601071927e-05, "loss": 0.0091, "num_input_tokens_seen": 27846432, "step": 8375 }, { "epoch": 1.676712602856214, "grad_norm": 0.5295249223709106, "learning_rate": 2.0392479154931377e-05, "loss": 0.0225, "num_input_tokens_seen": 27862720, "step": 8380 }, { "epoch": 1.6777130280369157, "grad_norm": 0.9511027932167053, "learning_rate": 2.036673476757108e-05, "loss": 0.0289, "num_input_tokens_seen": 27878336, "step": 8385 }, { "epoch": 1.6787134532176173, "grad_norm": 0.4604302942752838, "learning_rate": 2.034099546725682e-05, "loss": 0.0123, "num_input_tokens_seen": 27894368, "step": 8390 }, { "epoch": 1.6797138783983194, "grad_norm": 1.2224334478378296, "learning_rate": 2.031526128224881e-05, "loss": 0.0183, "num_input_tokens_seen": 27910720, "step": 8395 }, { "epoch": 1.680714303579021, "grad_norm": 0.639065682888031, "learning_rate": 2.0289532240801618e-05, "loss": 0.0237, "num_input_tokens_seen": 27927584, "step": 8400 }, { "epoch": 1.6817147287597227, "grad_norm": 0.16994817554950714, "learning_rate": 2.0263808371164185e-05, "loss": 0.0369, "num_input_tokens_seen": 27944064, "step": 8405 }, { "epoch": 1.6827151539404248, "grad_norm": 0.16935010254383087, "learning_rate": 2.023808970157978e-05, "loss": 0.0136, "num_input_tokens_seen": 27963712, "step": 8410 }, { "epoch": 1.6837155791211265, "grad_norm": 0.22500985860824585, "learning_rate": 2.0212376260285944e-05, "loss": 0.0135, "num_input_tokens_seen": 27979360, "step": 8415 }, { "epoch": 1.6847160043018283, "grad_norm": 0.639189600944519, "learning_rate": 2.018666807551448e-05, "loss": 0.0235, "num_input_tokens_seen": 27997728, "step": 8420 }, { "epoch": 1.6857164294825302, "grad_norm": 0.4524305760860443, "learning_rate": 2.0160965175491444e-05, "loss": 0.0187, "num_input_tokens_seen": 28014752, "step": 8425 }, { "epoch": 1.6867168546632318, "grad_norm": 0.16592879593372345, "learning_rate": 2.0135267588437054e-05, "loss": 0.0206, "num_input_tokens_seen": 28030688, "step": 8430 }, { "epoch": 1.6877172798439337, "grad_norm": 0.2719612121582031, "learning_rate": 2.0109575342565728e-05, "loss": 0.0228, "num_input_tokens_seen": 28047712, "step": 8435 }, { "epoch": 1.6887177050246356, "grad_norm": 0.34056931734085083, "learning_rate": 2.008388846608599e-05, "loss": 0.0189, "num_input_tokens_seen": 28065376, "step": 8440 }, { "epoch": 1.6897181302053372, "grad_norm": 0.1926608383655548, "learning_rate": 2.0058206987200488e-05, "loss": 0.0109, "num_input_tokens_seen": 28081760, "step": 8445 }, { "epoch": 1.690718555386039, "grad_norm": 1.205339789390564, "learning_rate": 2.003253093410594e-05, "loss": 0.0221, "num_input_tokens_seen": 28097728, "step": 8450 }, { "epoch": 1.691718980566741, "grad_norm": 0.37929344177246094, "learning_rate": 2.0006860334993105e-05, "loss": 0.015, "num_input_tokens_seen": 28115488, "step": 8455 }, { "epoch": 1.6927194057474426, "grad_norm": 0.446153461933136, "learning_rate": 1.9981195218046746e-05, "loss": 0.0092, "num_input_tokens_seen": 28132128, "step": 8460 }, { "epoch": 1.6937198309281445, "grad_norm": 0.4648114740848541, "learning_rate": 1.9955535611445625e-05, "loss": 0.0229, "num_input_tokens_seen": 28149056, "step": 8465 }, { "epoch": 1.6947202561088464, "grad_norm": 0.5778411626815796, "learning_rate": 1.9929881543362432e-05, "loss": 0.0278, "num_input_tokens_seen": 28165472, "step": 8470 }, { "epoch": 1.695720681289548, "grad_norm": 0.12256700545549393, "learning_rate": 1.9904233041963803e-05, "loss": 0.0121, "num_input_tokens_seen": 28181376, "step": 8475 }, { "epoch": 1.6967211064702499, "grad_norm": 0.1696489453315735, "learning_rate": 1.9878590135410244e-05, "loss": 0.0144, "num_input_tokens_seen": 28196672, "step": 8480 }, { "epoch": 1.6977215316509517, "grad_norm": 0.9251601696014404, "learning_rate": 1.9852952851856112e-05, "loss": 0.0308, "num_input_tokens_seen": 28213440, "step": 8485 }, { "epoch": 1.6987219568316534, "grad_norm": 1.0078340768814087, "learning_rate": 1.9827321219449608e-05, "loss": 0.036, "num_input_tokens_seen": 28230400, "step": 8490 }, { "epoch": 1.6997223820123553, "grad_norm": 0.4716654419898987, "learning_rate": 1.9801695266332715e-05, "loss": 0.0343, "num_input_tokens_seen": 28248160, "step": 8495 }, { "epoch": 1.7007228071930571, "grad_norm": 0.4144385755062103, "learning_rate": 1.9776075020641183e-05, "loss": 0.0052, "num_input_tokens_seen": 28265472, "step": 8500 }, { "epoch": 1.7017232323737588, "grad_norm": 0.1860032081604004, "learning_rate": 1.9750460510504508e-05, "loss": 0.0172, "num_input_tokens_seen": 28282272, "step": 8505 }, { "epoch": 1.7027236575544606, "grad_norm": 0.5012482404708862, "learning_rate": 1.972485176404587e-05, "loss": 0.0333, "num_input_tokens_seen": 28300064, "step": 8510 }, { "epoch": 1.7037240827351625, "grad_norm": 0.4445567727088928, "learning_rate": 1.969924880938213e-05, "loss": 0.0233, "num_input_tokens_seen": 28316384, "step": 8515 }, { "epoch": 1.7047245079158642, "grad_norm": 0.2888290584087372, "learning_rate": 1.967365167462379e-05, "loss": 0.0197, "num_input_tokens_seen": 28334368, "step": 8520 }, { "epoch": 1.705724933096566, "grad_norm": 0.28883203864097595, "learning_rate": 1.9648060387874967e-05, "loss": 0.0182, "num_input_tokens_seen": 28350624, "step": 8525 }, { "epoch": 1.706725358277268, "grad_norm": 0.4163239598274231, "learning_rate": 1.9622474977233352e-05, "loss": 0.0166, "num_input_tokens_seen": 28367680, "step": 8530 }, { "epoch": 1.7077257834579695, "grad_norm": 0.604323148727417, "learning_rate": 1.9596895470790174e-05, "loss": 0.022, "num_input_tokens_seen": 28385120, "step": 8535 }, { "epoch": 1.7087262086386714, "grad_norm": 0.5926190614700317, "learning_rate": 1.95713218966302e-05, "loss": 0.0121, "num_input_tokens_seen": 28401568, "step": 8540 }, { "epoch": 1.7097266338193733, "grad_norm": 0.17457112669944763, "learning_rate": 1.954575428283167e-05, "loss": 0.0346, "num_input_tokens_seen": 28419424, "step": 8545 }, { "epoch": 1.710727059000075, "grad_norm": 0.48416203260421753, "learning_rate": 1.9520192657466284e-05, "loss": 0.0211, "num_input_tokens_seen": 28435296, "step": 8550 }, { "epoch": 1.7117274841807768, "grad_norm": 0.3750211298465729, "learning_rate": 1.9494637048599158e-05, "loss": 0.0189, "num_input_tokens_seen": 28453120, "step": 8555 }, { "epoch": 1.7127279093614787, "grad_norm": 0.8351686000823975, "learning_rate": 1.9469087484288825e-05, "loss": 0.0353, "num_input_tokens_seen": 28469376, "step": 8560 }, { "epoch": 1.7137283345421803, "grad_norm": 0.5917026996612549, "learning_rate": 1.944354399258716e-05, "loss": 0.0206, "num_input_tokens_seen": 28487520, "step": 8565 }, { "epoch": 1.7147287597228822, "grad_norm": 0.6885085105895996, "learning_rate": 1.941800660153937e-05, "loss": 0.0165, "num_input_tokens_seen": 28503936, "step": 8570 }, { "epoch": 1.715729184903584, "grad_norm": 0.05982868745923042, "learning_rate": 1.9392475339183975e-05, "loss": 0.0221, "num_input_tokens_seen": 28520224, "step": 8575 }, { "epoch": 1.7167296100842857, "grad_norm": 0.817966878414154, "learning_rate": 1.9366950233552777e-05, "loss": 0.0148, "num_input_tokens_seen": 28536416, "step": 8580 }, { "epoch": 1.7177300352649876, "grad_norm": 0.600110650062561, "learning_rate": 1.9341431312670797e-05, "loss": 0.0301, "num_input_tokens_seen": 28552128, "step": 8585 }, { "epoch": 1.7187304604456894, "grad_norm": 0.5472889542579651, "learning_rate": 1.931591860455626e-05, "loss": 0.0257, "num_input_tokens_seen": 28568352, "step": 8590 }, { "epoch": 1.719730885626391, "grad_norm": 0.10418272018432617, "learning_rate": 1.9290412137220598e-05, "loss": 0.0158, "num_input_tokens_seen": 28584864, "step": 8595 }, { "epoch": 1.720731310807093, "grad_norm": 0.30167078971862793, "learning_rate": 1.926491193866837e-05, "loss": 0.014, "num_input_tokens_seen": 28601920, "step": 8600 }, { "epoch": 1.7217317359877948, "grad_norm": 0.22344553470611572, "learning_rate": 1.9239418036897253e-05, "loss": 0.0215, "num_input_tokens_seen": 28618464, "step": 8605 }, { "epoch": 1.7227321611684965, "grad_norm": 0.18009479343891144, "learning_rate": 1.9213930459898014e-05, "loss": 0.0219, "num_input_tokens_seen": 28634784, "step": 8610 }, { "epoch": 1.7237325863491986, "grad_norm": 0.43777209520339966, "learning_rate": 1.9188449235654488e-05, "loss": 0.0175, "num_input_tokens_seen": 28651296, "step": 8615 }, { "epoch": 1.7247330115299002, "grad_norm": 0.30056893825531006, "learning_rate": 1.9162974392143513e-05, "loss": 0.021, "num_input_tokens_seen": 28667776, "step": 8620 }, { "epoch": 1.7257334367106019, "grad_norm": 0.4077411890029907, "learning_rate": 1.9137505957334927e-05, "loss": 0.0179, "num_input_tokens_seen": 28683648, "step": 8625 }, { "epoch": 1.726733861891304, "grad_norm": 0.9050543904304504, "learning_rate": 1.911204395919155e-05, "loss": 0.0287, "num_input_tokens_seen": 28699712, "step": 8630 }, { "epoch": 1.7277342870720056, "grad_norm": 0.17446036636829376, "learning_rate": 1.9086588425669125e-05, "loss": 0.0225, "num_input_tokens_seen": 28715264, "step": 8635 }, { "epoch": 1.7287347122527073, "grad_norm": 0.09911100566387177, "learning_rate": 1.9061139384716282e-05, "loss": 0.0107, "num_input_tokens_seen": 28734208, "step": 8640 }, { "epoch": 1.7297351374334093, "grad_norm": 0.3007764518260956, "learning_rate": 1.9035696864274537e-05, "loss": 0.0354, "num_input_tokens_seen": 28752000, "step": 8645 }, { "epoch": 1.730735562614111, "grad_norm": 0.3763737976551056, "learning_rate": 1.901026089227825e-05, "loss": 0.0146, "num_input_tokens_seen": 28767776, "step": 8650 }, { "epoch": 1.7317359877948126, "grad_norm": 0.645039439201355, "learning_rate": 1.898483149665458e-05, "loss": 0.0175, "num_input_tokens_seen": 28784352, "step": 8655 }, { "epoch": 1.7327364129755147, "grad_norm": 0.3146284818649292, "learning_rate": 1.8959408705323473e-05, "loss": 0.0255, "num_input_tokens_seen": 28800384, "step": 8660 }, { "epoch": 1.7337368381562164, "grad_norm": 0.5576115250587463, "learning_rate": 1.8933992546197622e-05, "loss": 0.0239, "num_input_tokens_seen": 28817248, "step": 8665 }, { "epoch": 1.7347372633369182, "grad_norm": 0.5347667932510376, "learning_rate": 1.8908583047182436e-05, "loss": 0.0195, "num_input_tokens_seen": 28833344, "step": 8670 }, { "epoch": 1.7357376885176201, "grad_norm": 0.22791774570941925, "learning_rate": 1.888318023617601e-05, "loss": 0.0379, "num_input_tokens_seen": 28849984, "step": 8675 }, { "epoch": 1.7367381136983218, "grad_norm": 0.201163187623024, "learning_rate": 1.8857784141069113e-05, "loss": 0.029, "num_input_tokens_seen": 28865056, "step": 8680 }, { "epoch": 1.7377385388790236, "grad_norm": 0.0956418439745903, "learning_rate": 1.8832394789745117e-05, "loss": 0.0145, "num_input_tokens_seen": 28882016, "step": 8685 }, { "epoch": 1.7387389640597255, "grad_norm": 0.1477956920862198, "learning_rate": 1.8807012210080005e-05, "loss": 0.0238, "num_input_tokens_seen": 28898624, "step": 8690 }, { "epoch": 1.7397393892404271, "grad_norm": 0.6203426122665405, "learning_rate": 1.8781636429942318e-05, "loss": 0.0249, "num_input_tokens_seen": 28915040, "step": 8695 }, { "epoch": 1.740739814421129, "grad_norm": 0.15471091866493225, "learning_rate": 1.875626747719313e-05, "loss": 0.012, "num_input_tokens_seen": 28932032, "step": 8700 }, { "epoch": 1.7417402396018309, "grad_norm": 0.3013649880886078, "learning_rate": 1.8730905379686026e-05, "loss": 0.0166, "num_input_tokens_seen": 28947136, "step": 8705 }, { "epoch": 1.7427406647825325, "grad_norm": 0.5449125170707703, "learning_rate": 1.8705550165267067e-05, "loss": 0.0181, "num_input_tokens_seen": 28963360, "step": 8710 }, { "epoch": 1.7437410899632344, "grad_norm": 0.3873887360095978, "learning_rate": 1.8680201861774733e-05, "loss": 0.0177, "num_input_tokens_seen": 28979584, "step": 8715 }, { "epoch": 1.7447415151439363, "grad_norm": 0.4926188588142395, "learning_rate": 1.8654860497039954e-05, "loss": 0.0297, "num_input_tokens_seen": 28996288, "step": 8720 }, { "epoch": 1.745741940324638, "grad_norm": 0.5046607851982117, "learning_rate": 1.8629526098886007e-05, "loss": 0.0267, "num_input_tokens_seen": 29012384, "step": 8725 }, { "epoch": 1.7467423655053398, "grad_norm": 0.5384324193000793, "learning_rate": 1.8604198695128534e-05, "loss": 0.0204, "num_input_tokens_seen": 29028544, "step": 8730 }, { "epoch": 1.7477427906860417, "grad_norm": 0.30341100692749023, "learning_rate": 1.8578878313575516e-05, "loss": 0.0178, "num_input_tokens_seen": 29045056, "step": 8735 }, { "epoch": 1.7487432158667433, "grad_norm": 0.12679381668567657, "learning_rate": 1.8553564982027183e-05, "loss": 0.0167, "num_input_tokens_seen": 29062624, "step": 8740 }, { "epoch": 1.7497436410474452, "grad_norm": 0.23378930985927582, "learning_rate": 1.8528258728276067e-05, "loss": 0.0159, "num_input_tokens_seen": 29079456, "step": 8745 }, { "epoch": 1.750744066228147, "grad_norm": 0.6869626641273499, "learning_rate": 1.8502959580106906e-05, "loss": 0.0117, "num_input_tokens_seen": 29095904, "step": 8750 }, { "epoch": 1.7517444914088487, "grad_norm": 1.1471084356307983, "learning_rate": 1.8477667565296623e-05, "loss": 0.0229, "num_input_tokens_seen": 29112480, "step": 8755 }, { "epoch": 1.7527449165895506, "grad_norm": 0.5438526272773743, "learning_rate": 1.845238271161435e-05, "loss": 0.02, "num_input_tokens_seen": 29128288, "step": 8760 }, { "epoch": 1.7537453417702524, "grad_norm": 0.5923305153846741, "learning_rate": 1.842710504682132e-05, "loss": 0.0242, "num_input_tokens_seen": 29145216, "step": 8765 }, { "epoch": 1.754745766950954, "grad_norm": 0.9874956011772156, "learning_rate": 1.8401834598670887e-05, "loss": 0.0268, "num_input_tokens_seen": 29161728, "step": 8770 }, { "epoch": 1.755746192131656, "grad_norm": 0.6199465990066528, "learning_rate": 1.837657139490848e-05, "loss": 0.0185, "num_input_tokens_seen": 29178048, "step": 8775 }, { "epoch": 1.7567466173123578, "grad_norm": 0.3383076786994934, "learning_rate": 1.8351315463271568e-05, "loss": 0.0314, "num_input_tokens_seen": 29196032, "step": 8780 }, { "epoch": 1.7577470424930595, "grad_norm": 0.3935118317604065, "learning_rate": 1.8326066831489663e-05, "loss": 0.0234, "num_input_tokens_seen": 29211840, "step": 8785 }, { "epoch": 1.7587474676737613, "grad_norm": 0.1697453409433365, "learning_rate": 1.8300825527284225e-05, "loss": 0.023, "num_input_tokens_seen": 29227392, "step": 8790 }, { "epoch": 1.7597478928544632, "grad_norm": 0.48267650604248047, "learning_rate": 1.8275591578368683e-05, "loss": 0.0259, "num_input_tokens_seen": 29242528, "step": 8795 }, { "epoch": 1.7607483180351648, "grad_norm": 0.3457343876361847, "learning_rate": 1.82503650124484e-05, "loss": 0.0219, "num_input_tokens_seen": 29258912, "step": 8800 }, { "epoch": 1.7617487432158667, "grad_norm": 0.3773598074913025, "learning_rate": 1.822514585722063e-05, "loss": 0.009, "num_input_tokens_seen": 29275904, "step": 8805 }, { "epoch": 1.7627491683965686, "grad_norm": 0.44147181510925293, "learning_rate": 1.8199934140374465e-05, "loss": 0.0227, "num_input_tokens_seen": 29292192, "step": 8810 }, { "epoch": 1.7637495935772702, "grad_norm": 0.5126002430915833, "learning_rate": 1.8174729889590874e-05, "loss": 0.0142, "num_input_tokens_seen": 29308640, "step": 8815 }, { "epoch": 1.764750018757972, "grad_norm": 1.294550895690918, "learning_rate": 1.8149533132542594e-05, "loss": 0.0182, "num_input_tokens_seen": 29324672, "step": 8820 }, { "epoch": 1.765750443938674, "grad_norm": 0.2473556399345398, "learning_rate": 1.8124343896894148e-05, "loss": 0.0188, "num_input_tokens_seen": 29341504, "step": 8825 }, { "epoch": 1.7667508691193756, "grad_norm": 0.36207565665245056, "learning_rate": 1.8099162210301795e-05, "loss": 0.0115, "num_input_tokens_seen": 29357952, "step": 8830 }, { "epoch": 1.7677512943000775, "grad_norm": 0.5558183193206787, "learning_rate": 1.8073988100413514e-05, "loss": 0.0276, "num_input_tokens_seen": 29374752, "step": 8835 }, { "epoch": 1.7687517194807794, "grad_norm": 0.6651312708854675, "learning_rate": 1.804882159486897e-05, "loss": 0.0193, "num_input_tokens_seen": 29391936, "step": 8840 }, { "epoch": 1.769752144661481, "grad_norm": 0.13344928622245789, "learning_rate": 1.8023662721299455e-05, "loss": 0.0306, "num_input_tokens_seen": 29408512, "step": 8845 }, { "epoch": 1.7707525698421829, "grad_norm": 0.3455791473388672, "learning_rate": 1.7998511507327898e-05, "loss": 0.0137, "num_input_tokens_seen": 29425632, "step": 8850 }, { "epoch": 1.7717529950228847, "grad_norm": 0.30371081829071045, "learning_rate": 1.7973367980568822e-05, "loss": 0.0168, "num_input_tokens_seen": 29443840, "step": 8855 }, { "epoch": 1.7727534202035864, "grad_norm": 0.3325488269329071, "learning_rate": 1.7948232168628305e-05, "loss": 0.0279, "num_input_tokens_seen": 29459584, "step": 8860 }, { "epoch": 1.7737538453842885, "grad_norm": 1.1632492542266846, "learning_rate": 1.7923104099103945e-05, "loss": 0.0253, "num_input_tokens_seen": 29476704, "step": 8865 }, { "epoch": 1.7747542705649901, "grad_norm": 0.23372527956962585, "learning_rate": 1.7897983799584856e-05, "loss": 0.0166, "num_input_tokens_seen": 29493920, "step": 8870 }, { "epoch": 1.7757546957456918, "grad_norm": 0.11346451938152313, "learning_rate": 1.7872871297651613e-05, "loss": 0.0192, "num_input_tokens_seen": 29511392, "step": 8875 }, { "epoch": 1.7767551209263939, "grad_norm": 0.7949175238609314, "learning_rate": 1.7847766620876223e-05, "loss": 0.0273, "num_input_tokens_seen": 29528096, "step": 8880 }, { "epoch": 1.7777555461070955, "grad_norm": 0.3690131902694702, "learning_rate": 1.782266979682211e-05, "loss": 0.0314, "num_input_tokens_seen": 29544512, "step": 8885 }, { "epoch": 1.7787559712877972, "grad_norm": 0.9416623115539551, "learning_rate": 1.779758085304409e-05, "loss": 0.0331, "num_input_tokens_seen": 29561184, "step": 8890 }, { "epoch": 1.7797563964684993, "grad_norm": 0.3709040582180023, "learning_rate": 1.7772499817088297e-05, "loss": 0.0313, "num_input_tokens_seen": 29577408, "step": 8895 }, { "epoch": 1.780756821649201, "grad_norm": 0.3078094720840454, "learning_rate": 1.77474267164922e-05, "loss": 0.016, "num_input_tokens_seen": 29594944, "step": 8900 }, { "epoch": 1.7817572468299026, "grad_norm": 0.5173088908195496, "learning_rate": 1.7722361578784547e-05, "loss": 0.0282, "num_input_tokens_seen": 29612416, "step": 8905 }, { "epoch": 1.7827576720106046, "grad_norm": 0.11358284950256348, "learning_rate": 1.7697304431485358e-05, "loss": 0.0117, "num_input_tokens_seen": 29628736, "step": 8910 }, { "epoch": 1.7837580971913063, "grad_norm": 0.6156454682350159, "learning_rate": 1.7672255302105868e-05, "loss": 0.0298, "num_input_tokens_seen": 29646144, "step": 8915 }, { "epoch": 1.784758522372008, "grad_norm": 0.7819226980209351, "learning_rate": 1.7647214218148505e-05, "loss": 0.0254, "num_input_tokens_seen": 29662752, "step": 8920 }, { "epoch": 1.78575894755271, "grad_norm": 0.25487658381462097, "learning_rate": 1.7622181207106884e-05, "loss": 0.013, "num_input_tokens_seen": 29679296, "step": 8925 }, { "epoch": 1.7867593727334117, "grad_norm": 0.5328194499015808, "learning_rate": 1.7597156296465734e-05, "loss": 0.0133, "num_input_tokens_seen": 29695840, "step": 8930 }, { "epoch": 1.7877597979141135, "grad_norm": 0.5457685589790344, "learning_rate": 1.7572139513700885e-05, "loss": 0.025, "num_input_tokens_seen": 29713088, "step": 8935 }, { "epoch": 1.7887602230948154, "grad_norm": 0.23857742547988892, "learning_rate": 1.7547130886279283e-05, "loss": 0.0107, "num_input_tokens_seen": 29729632, "step": 8940 }, { "epoch": 1.789760648275517, "grad_norm": 0.4583163857460022, "learning_rate": 1.7522130441658884e-05, "loss": 0.0196, "num_input_tokens_seen": 29745760, "step": 8945 }, { "epoch": 1.790761073456219, "grad_norm": 0.3863668143749237, "learning_rate": 1.749713820728867e-05, "loss": 0.0236, "num_input_tokens_seen": 29762272, "step": 8950 }, { "epoch": 1.7917614986369208, "grad_norm": 0.2928677201271057, "learning_rate": 1.747215421060861e-05, "loss": 0.0267, "num_input_tokens_seen": 29778976, "step": 8955 }, { "epoch": 1.7927619238176224, "grad_norm": 0.4981392025947571, "learning_rate": 1.7447178479049615e-05, "loss": 0.0195, "num_input_tokens_seen": 29795680, "step": 8960 }, { "epoch": 1.7937623489983243, "grad_norm": 0.04591726139187813, "learning_rate": 1.742221104003355e-05, "loss": 0.011, "num_input_tokens_seen": 29813728, "step": 8965 }, { "epoch": 1.7947627741790262, "grad_norm": 0.23080898821353912, "learning_rate": 1.7397251920973146e-05, "loss": 0.0148, "num_input_tokens_seen": 29829440, "step": 8970 }, { "epoch": 1.7957631993597278, "grad_norm": 0.935471773147583, "learning_rate": 1.7372301149272012e-05, "loss": 0.019, "num_input_tokens_seen": 29848000, "step": 8975 }, { "epoch": 1.7967636245404297, "grad_norm": 0.413185715675354, "learning_rate": 1.7347358752324605e-05, "loss": 0.0181, "num_input_tokens_seen": 29866240, "step": 8980 }, { "epoch": 1.7977640497211316, "grad_norm": 0.6893980503082275, "learning_rate": 1.7322424757516155e-05, "loss": 0.0212, "num_input_tokens_seen": 29883360, "step": 8985 }, { "epoch": 1.7987644749018332, "grad_norm": 0.6806471347808838, "learning_rate": 1.7297499192222705e-05, "loss": 0.0303, "num_input_tokens_seen": 29899744, "step": 8990 }, { "epoch": 1.799764900082535, "grad_norm": 0.20109929144382477, "learning_rate": 1.727258208381101e-05, "loss": 0.0311, "num_input_tokens_seen": 29917024, "step": 8995 }, { "epoch": 1.800765325263237, "grad_norm": 0.954805314540863, "learning_rate": 1.7247673459638563e-05, "loss": 0.0167, "num_input_tokens_seen": 29933472, "step": 9000 }, { "epoch": 1.8017657504439386, "grad_norm": 0.4293767213821411, "learning_rate": 1.722277334705353e-05, "loss": 0.0238, "num_input_tokens_seen": 29950528, "step": 9005 }, { "epoch": 1.8027661756246405, "grad_norm": 0.3819023072719574, "learning_rate": 1.7197881773394723e-05, "loss": 0.0382, "num_input_tokens_seen": 29967008, "step": 9010 }, { "epoch": 1.8037666008053423, "grad_norm": 0.42908862233161926, "learning_rate": 1.7172998765991604e-05, "loss": 0.0233, "num_input_tokens_seen": 29983808, "step": 9015 }, { "epoch": 1.804767025986044, "grad_norm": 0.07979098707437515, "learning_rate": 1.7148124352164216e-05, "loss": 0.016, "num_input_tokens_seen": 30002112, "step": 9020 }, { "epoch": 1.8057674511667459, "grad_norm": 0.5107935667037964, "learning_rate": 1.712325855922316e-05, "loss": 0.0196, "num_input_tokens_seen": 30018944, "step": 9025 }, { "epoch": 1.8067678763474477, "grad_norm": 0.34811583161354065, "learning_rate": 1.7098401414469574e-05, "loss": 0.0183, "num_input_tokens_seen": 30035616, "step": 9030 }, { "epoch": 1.8077683015281494, "grad_norm": 0.20033659040927887, "learning_rate": 1.7073552945195105e-05, "loss": 0.0207, "num_input_tokens_seen": 30051072, "step": 9035 }, { "epoch": 1.8087687267088512, "grad_norm": 0.5653337836265564, "learning_rate": 1.7048713178681892e-05, "loss": 0.03, "num_input_tokens_seen": 30068864, "step": 9040 }, { "epoch": 1.8097691518895531, "grad_norm": 0.3743863105773926, "learning_rate": 1.702388214220249e-05, "loss": 0.012, "num_input_tokens_seen": 30086592, "step": 9045 }, { "epoch": 1.8107695770702548, "grad_norm": 0.17334720492362976, "learning_rate": 1.6999059863019877e-05, "loss": 0.0143, "num_input_tokens_seen": 30102752, "step": 9050 }, { "epoch": 1.8117700022509566, "grad_norm": 0.31735971570014954, "learning_rate": 1.697424636838743e-05, "loss": 0.0129, "num_input_tokens_seen": 30118592, "step": 9055 }, { "epoch": 1.8127704274316585, "grad_norm": 0.394786536693573, "learning_rate": 1.694944168554886e-05, "loss": 0.0236, "num_input_tokens_seen": 30136320, "step": 9060 }, { "epoch": 1.8137708526123602, "grad_norm": 0.0013086630497127771, "learning_rate": 1.6924645841738223e-05, "loss": 0.0117, "num_input_tokens_seen": 30153376, "step": 9065 }, { "epoch": 1.814771277793062, "grad_norm": 0.12593862414360046, "learning_rate": 1.6899858864179858e-05, "loss": 0.0111, "num_input_tokens_seen": 30169728, "step": 9070 }, { "epoch": 1.815771702973764, "grad_norm": 0.44221794605255127, "learning_rate": 1.687508078008837e-05, "loss": 0.0185, "num_input_tokens_seen": 30186976, "step": 9075 }, { "epoch": 1.8167721281544655, "grad_norm": 0.30854108929634094, "learning_rate": 1.6850311616668606e-05, "loss": 0.0211, "num_input_tokens_seen": 30204704, "step": 9080 }, { "epoch": 1.8177725533351674, "grad_norm": 0.26250120997428894, "learning_rate": 1.6825551401115616e-05, "loss": 0.0112, "num_input_tokens_seen": 30221696, "step": 9085 }, { "epoch": 1.8187729785158693, "grad_norm": 0.13034704327583313, "learning_rate": 1.680080016061461e-05, "loss": 0.0233, "num_input_tokens_seen": 30238208, "step": 9090 }, { "epoch": 1.819773403696571, "grad_norm": 0.7788635492324829, "learning_rate": 1.677605792234099e-05, "loss": 0.0147, "num_input_tokens_seen": 30255392, "step": 9095 }, { "epoch": 1.8207738288772728, "grad_norm": 0.3505137264728546, "learning_rate": 1.6751324713460214e-05, "loss": 0.006, "num_input_tokens_seen": 30272768, "step": 9100 }, { "epoch": 1.8217742540579747, "grad_norm": 0.4570329487323761, "learning_rate": 1.6726600561127865e-05, "loss": 0.0275, "num_input_tokens_seen": 30290464, "step": 9105 }, { "epoch": 1.8227746792386763, "grad_norm": 0.5529378056526184, "learning_rate": 1.6701885492489576e-05, "loss": 0.0229, "num_input_tokens_seen": 30307296, "step": 9110 }, { "epoch": 1.8237751044193784, "grad_norm": 0.5718985199928284, "learning_rate": 1.6677179534681e-05, "loss": 0.0148, "num_input_tokens_seen": 30323712, "step": 9115 }, { "epoch": 1.82477552960008, "grad_norm": 0.6405457854270935, "learning_rate": 1.665248271482778e-05, "loss": 0.0151, "num_input_tokens_seen": 30341504, "step": 9120 }, { "epoch": 1.8257759547807817, "grad_norm": 0.3813199996948242, "learning_rate": 1.6627795060045556e-05, "loss": 0.0135, "num_input_tokens_seen": 30358752, "step": 9125 }, { "epoch": 1.8267763799614838, "grad_norm": 0.25917142629623413, "learning_rate": 1.660311659743987e-05, "loss": 0.0163, "num_input_tokens_seen": 30375200, "step": 9130 }, { "epoch": 1.8277768051421854, "grad_norm": 0.25971898436546326, "learning_rate": 1.6578447354106195e-05, "loss": 0.0169, "num_input_tokens_seen": 30393312, "step": 9135 }, { "epoch": 1.828777230322887, "grad_norm": 0.0786486342549324, "learning_rate": 1.655378735712986e-05, "loss": 0.0116, "num_input_tokens_seen": 30410848, "step": 9140 }, { "epoch": 1.8297776555035892, "grad_norm": 1.411215901374817, "learning_rate": 1.6529136633586075e-05, "loss": 0.0195, "num_input_tokens_seen": 30429664, "step": 9145 }, { "epoch": 1.8307780806842908, "grad_norm": 0.7602545619010925, "learning_rate": 1.6504495210539845e-05, "loss": 0.0066, "num_input_tokens_seen": 30446400, "step": 9150 }, { "epoch": 1.8317785058649925, "grad_norm": 0.48819780349731445, "learning_rate": 1.6479863115045956e-05, "loss": 0.0255, "num_input_tokens_seen": 30462848, "step": 9155 }, { "epoch": 1.8327789310456946, "grad_norm": 0.2878517806529999, "learning_rate": 1.645524037414897e-05, "loss": 0.0188, "num_input_tokens_seen": 30480448, "step": 9160 }, { "epoch": 1.8337793562263962, "grad_norm": 0.7377524375915527, "learning_rate": 1.643062701488318e-05, "loss": 0.0225, "num_input_tokens_seen": 30496992, "step": 9165 }, { "epoch": 1.8347797814070979, "grad_norm": 0.295469731092453, "learning_rate": 1.640602306427257e-05, "loss": 0.0077, "num_input_tokens_seen": 30514016, "step": 9170 }, { "epoch": 1.8357802065878, "grad_norm": 0.7386573553085327, "learning_rate": 1.638142854933078e-05, "loss": 0.0185, "num_input_tokens_seen": 30529216, "step": 9175 }, { "epoch": 1.8367806317685016, "grad_norm": 0.5637227296829224, "learning_rate": 1.6356843497061126e-05, "loss": 0.0295, "num_input_tokens_seen": 30545408, "step": 9180 }, { "epoch": 1.8377810569492035, "grad_norm": 0.15489616990089417, "learning_rate": 1.63322679344565e-05, "loss": 0.0167, "num_input_tokens_seen": 30563168, "step": 9185 }, { "epoch": 1.8387814821299053, "grad_norm": 0.34152811765670776, "learning_rate": 1.6307701888499387e-05, "loss": 0.0123, "num_input_tokens_seen": 30579776, "step": 9190 }, { "epoch": 1.839781907310607, "grad_norm": 0.14039745926856995, "learning_rate": 1.628314538616184e-05, "loss": 0.017, "num_input_tokens_seen": 30595744, "step": 9195 }, { "epoch": 1.8407823324913088, "grad_norm": 1.1402007341384888, "learning_rate": 1.6258598454405402e-05, "loss": 0.0237, "num_input_tokens_seen": 30610816, "step": 9200 }, { "epoch": 1.8417827576720107, "grad_norm": 0.2333715558052063, "learning_rate": 1.6234061120181142e-05, "loss": 0.0209, "num_input_tokens_seen": 30627552, "step": 9205 }, { "epoch": 1.8427831828527124, "grad_norm": 0.45906734466552734, "learning_rate": 1.6209533410429565e-05, "loss": 0.0157, "num_input_tokens_seen": 30644288, "step": 9210 }, { "epoch": 1.8437836080334142, "grad_norm": 0.45235124230384827, "learning_rate": 1.618501535208061e-05, "loss": 0.0193, "num_input_tokens_seen": 30661856, "step": 9215 }, { "epoch": 1.844784033214116, "grad_norm": 0.8836572170257568, "learning_rate": 1.616050697205365e-05, "loss": 0.0282, "num_input_tokens_seen": 30677568, "step": 9220 }, { "epoch": 1.8457844583948178, "grad_norm": 0.4476397633552551, "learning_rate": 1.6136008297257395e-05, "loss": 0.0089, "num_input_tokens_seen": 30693664, "step": 9225 }, { "epoch": 1.8467848835755196, "grad_norm": 0.5498594045639038, "learning_rate": 1.6111519354589905e-05, "loss": 0.0153, "num_input_tokens_seen": 30710368, "step": 9230 }, { "epoch": 1.8477853087562215, "grad_norm": 0.28257307410240173, "learning_rate": 1.608704017093858e-05, "loss": 0.0191, "num_input_tokens_seen": 30727584, "step": 9235 }, { "epoch": 1.8487857339369231, "grad_norm": 0.4074607491493225, "learning_rate": 1.6062570773180083e-05, "loss": 0.0203, "num_input_tokens_seen": 30744224, "step": 9240 }, { "epoch": 1.849786159117625, "grad_norm": 0.40445345640182495, "learning_rate": 1.6038111188180328e-05, "loss": 0.025, "num_input_tokens_seen": 30761024, "step": 9245 }, { "epoch": 1.8507865842983269, "grad_norm": 0.6417256593704224, "learning_rate": 1.601366144279448e-05, "loss": 0.0199, "num_input_tokens_seen": 30777792, "step": 9250 }, { "epoch": 1.8517870094790285, "grad_norm": 0.21276015043258667, "learning_rate": 1.5989221563866882e-05, "loss": 0.0132, "num_input_tokens_seen": 30794880, "step": 9255 }, { "epoch": 1.8527874346597304, "grad_norm": 0.8027820587158203, "learning_rate": 1.5964791578231052e-05, "loss": 0.0174, "num_input_tokens_seen": 30811328, "step": 9260 }, { "epoch": 1.8537878598404323, "grad_norm": 0.9955580234527588, "learning_rate": 1.5940371512709634e-05, "loss": 0.0142, "num_input_tokens_seen": 30826912, "step": 9265 }, { "epoch": 1.854788285021134, "grad_norm": 0.3065422475337982, "learning_rate": 1.5915961394114386e-05, "loss": 0.0178, "num_input_tokens_seen": 30844224, "step": 9270 }, { "epoch": 1.8557887102018358, "grad_norm": 0.17857109010219574, "learning_rate": 1.5891561249246162e-05, "loss": 0.0105, "num_input_tokens_seen": 30860960, "step": 9275 }, { "epoch": 1.8567891353825376, "grad_norm": 0.3081003725528717, "learning_rate": 1.5867171104894843e-05, "loss": 0.0132, "num_input_tokens_seen": 30877280, "step": 9280 }, { "epoch": 1.8577895605632393, "grad_norm": 0.386561781167984, "learning_rate": 1.584279098783933e-05, "loss": 0.0229, "num_input_tokens_seen": 30893088, "step": 9285 }, { "epoch": 1.8587899857439412, "grad_norm": 0.22408875823020935, "learning_rate": 1.5818420924847535e-05, "loss": 0.0305, "num_input_tokens_seen": 30910496, "step": 9290 }, { "epoch": 1.859790410924643, "grad_norm": 1.0091034173965454, "learning_rate": 1.5794060942676302e-05, "loss": 0.0418, "num_input_tokens_seen": 30926432, "step": 9295 }, { "epoch": 1.8607908361053447, "grad_norm": 0.43769577145576477, "learning_rate": 1.5769711068071446e-05, "loss": 0.0366, "num_input_tokens_seen": 30942976, "step": 9300 }, { "epoch": 1.8617912612860466, "grad_norm": 0.45385193824768066, "learning_rate": 1.5745371327767643e-05, "loss": 0.0187, "num_input_tokens_seen": 30961216, "step": 9305 }, { "epoch": 1.8627916864667484, "grad_norm": 0.21157459914684296, "learning_rate": 1.572104174848848e-05, "loss": 0.0315, "num_input_tokens_seen": 30976928, "step": 9310 }, { "epoch": 1.86379211164745, "grad_norm": 0.009228398092091084, "learning_rate": 1.5696722356946354e-05, "loss": 0.0136, "num_input_tokens_seen": 30993440, "step": 9315 }, { "epoch": 1.864792536828152, "grad_norm": 0.22807136178016663, "learning_rate": 1.5672413179842494e-05, "loss": 0.0205, "num_input_tokens_seen": 31010304, "step": 9320 }, { "epoch": 1.8657929620088538, "grad_norm": 0.08223238587379456, "learning_rate": 1.564811424386691e-05, "loss": 0.0044, "num_input_tokens_seen": 31026144, "step": 9325 }, { "epoch": 1.8667933871895555, "grad_norm": 0.0866706371307373, "learning_rate": 1.5623825575698376e-05, "loss": 0.0273, "num_input_tokens_seen": 31042368, "step": 9330 }, { "epoch": 1.8677938123702573, "grad_norm": 1.7599283456802368, "learning_rate": 1.5599547202004385e-05, "loss": 0.0312, "num_input_tokens_seen": 31059264, "step": 9335 }, { "epoch": 1.8687942375509592, "grad_norm": 0.42392227053642273, "learning_rate": 1.5575279149441124e-05, "loss": 0.0206, "num_input_tokens_seen": 31076448, "step": 9340 }, { "epoch": 1.8697946627316608, "grad_norm": 0.3624468147754669, "learning_rate": 1.5551021444653447e-05, "loss": 0.0197, "num_input_tokens_seen": 31091872, "step": 9345 }, { "epoch": 1.8707950879123627, "grad_norm": 0.2185261994600296, "learning_rate": 1.5526774114274866e-05, "loss": 0.0133, "num_input_tokens_seen": 31107968, "step": 9350 }, { "epoch": 1.8717955130930646, "grad_norm": 0.10084425657987595, "learning_rate": 1.5502537184927486e-05, "loss": 0.0194, "num_input_tokens_seen": 31124384, "step": 9355 }, { "epoch": 1.8727959382737662, "grad_norm": 0.32103344798088074, "learning_rate": 1.5478310683221987e-05, "loss": 0.0216, "num_input_tokens_seen": 31140832, "step": 9360 }, { "epoch": 1.873796363454468, "grad_norm": 0.2012992799282074, "learning_rate": 1.5454094635757616e-05, "loss": 0.0147, "num_input_tokens_seen": 31157312, "step": 9365 }, { "epoch": 1.87479678863517, "grad_norm": 0.29973459243774414, "learning_rate": 1.5429889069122133e-05, "loss": 0.0149, "num_input_tokens_seen": 31173152, "step": 9370 }, { "epoch": 1.8757972138158716, "grad_norm": 0.7842708826065063, "learning_rate": 1.5405694009891787e-05, "loss": 0.0182, "num_input_tokens_seen": 31191136, "step": 9375 }, { "epoch": 1.8767976389965737, "grad_norm": 0.7371029853820801, "learning_rate": 1.538150948463129e-05, "loss": 0.0169, "num_input_tokens_seen": 31208160, "step": 9380 }, { "epoch": 1.8777980641772754, "grad_norm": 1.6760576963424683, "learning_rate": 1.535733551989381e-05, "loss": 0.0198, "num_input_tokens_seen": 31224224, "step": 9385 }, { "epoch": 1.878798489357977, "grad_norm": 0.3584641218185425, "learning_rate": 1.5333172142220892e-05, "loss": 0.0089, "num_input_tokens_seen": 31242272, "step": 9390 }, { "epoch": 1.879798914538679, "grad_norm": 0.059135545045137405, "learning_rate": 1.5309019378142468e-05, "loss": 0.013, "num_input_tokens_seen": 31259776, "step": 9395 }, { "epoch": 1.8807993397193807, "grad_norm": 0.26289892196655273, "learning_rate": 1.5284877254176814e-05, "loss": 0.0126, "num_input_tokens_seen": 31275808, "step": 9400 }, { "epoch": 1.8817997649000824, "grad_norm": 0.49559512734413147, "learning_rate": 1.5260745796830545e-05, "loss": 0.0194, "num_input_tokens_seen": 31292128, "step": 9405 }, { "epoch": 1.8828001900807845, "grad_norm": 0.1455199271440506, "learning_rate": 1.5236625032598533e-05, "loss": 0.0215, "num_input_tokens_seen": 31308576, "step": 9410 }, { "epoch": 1.8838006152614861, "grad_norm": 0.8640939593315125, "learning_rate": 1.5212514987963923e-05, "loss": 0.0207, "num_input_tokens_seen": 31325984, "step": 9415 }, { "epoch": 1.8848010404421878, "grad_norm": 0.04954162985086441, "learning_rate": 1.51884156893981e-05, "loss": 0.0198, "num_input_tokens_seen": 31342816, "step": 9420 }, { "epoch": 1.8858014656228899, "grad_norm": 0.5669306516647339, "learning_rate": 1.516432716336064e-05, "loss": 0.0136, "num_input_tokens_seen": 31359520, "step": 9425 }, { "epoch": 1.8868018908035915, "grad_norm": 0.32257279753685, "learning_rate": 1.514024943629928e-05, "loss": 0.016, "num_input_tokens_seen": 31377184, "step": 9430 }, { "epoch": 1.8878023159842934, "grad_norm": 0.2889237701892853, "learning_rate": 1.511618253464993e-05, "loss": 0.0131, "num_input_tokens_seen": 31393216, "step": 9435 }, { "epoch": 1.8888027411649952, "grad_norm": 0.5731465816497803, "learning_rate": 1.5092126484836594e-05, "loss": 0.0118, "num_input_tokens_seen": 31410272, "step": 9440 }, { "epoch": 1.889803166345697, "grad_norm": 0.0007583815604448318, "learning_rate": 1.506808131327136e-05, "loss": 0.0259, "num_input_tokens_seen": 31427136, "step": 9445 }, { "epoch": 1.8908035915263988, "grad_norm": 0.330671101808548, "learning_rate": 1.5044047046354368e-05, "loss": 0.0118, "num_input_tokens_seen": 31445184, "step": 9450 }, { "epoch": 1.8918040167071006, "grad_norm": 0.39024069905281067, "learning_rate": 1.5020023710473812e-05, "loss": 0.0112, "num_input_tokens_seen": 31462016, "step": 9455 }, { "epoch": 1.8928044418878023, "grad_norm": 0.10673998296260834, "learning_rate": 1.4996011332005867e-05, "loss": 0.0333, "num_input_tokens_seen": 31479296, "step": 9460 }, { "epoch": 1.8938048670685041, "grad_norm": 0.1995709240436554, "learning_rate": 1.4972009937314673e-05, "loss": 0.0207, "num_input_tokens_seen": 31497184, "step": 9465 }, { "epoch": 1.894805292249206, "grad_norm": 0.5582648515701294, "learning_rate": 1.494801955275231e-05, "loss": 0.0151, "num_input_tokens_seen": 31514784, "step": 9470 }, { "epoch": 1.8958057174299077, "grad_norm": 0.05137334764003754, "learning_rate": 1.4924040204658785e-05, "loss": 0.012, "num_input_tokens_seen": 31531488, "step": 9475 }, { "epoch": 1.8968061426106095, "grad_norm": 0.010768786072731018, "learning_rate": 1.4900071919361969e-05, "loss": 0.0155, "num_input_tokens_seen": 31549056, "step": 9480 }, { "epoch": 1.8978065677913114, "grad_norm": 0.21186643838882446, "learning_rate": 1.4876114723177596e-05, "loss": 0.013, "num_input_tokens_seen": 31566368, "step": 9485 }, { "epoch": 1.898806992972013, "grad_norm": 0.3738987445831299, "learning_rate": 1.4852168642409226e-05, "loss": 0.0105, "num_input_tokens_seen": 31582528, "step": 9490 }, { "epoch": 1.899807418152715, "grad_norm": 0.7078602910041809, "learning_rate": 1.4828233703348216e-05, "loss": 0.0289, "num_input_tokens_seen": 31599136, "step": 9495 }, { "epoch": 1.9008078433334168, "grad_norm": 0.599572479724884, "learning_rate": 1.4804309932273669e-05, "loss": 0.0364, "num_input_tokens_seen": 31615360, "step": 9500 }, { "epoch": 1.9018082685141184, "grad_norm": 1.5461905002593994, "learning_rate": 1.4780397355452472e-05, "loss": 0.0254, "num_input_tokens_seen": 31633152, "step": 9505 }, { "epoch": 1.9028086936948203, "grad_norm": 0.45884576439857483, "learning_rate": 1.4756495999139171e-05, "loss": 0.0114, "num_input_tokens_seen": 31648224, "step": 9510 }, { "epoch": 1.9038091188755222, "grad_norm": 0.07703045755624771, "learning_rate": 1.4732605889576031e-05, "loss": 0.0221, "num_input_tokens_seen": 31665632, "step": 9515 }, { "epoch": 1.9048095440562238, "grad_norm": 0.08801163733005524, "learning_rate": 1.4708727052992943e-05, "loss": 0.014, "num_input_tokens_seen": 31682944, "step": 9520 }, { "epoch": 1.9058099692369257, "grad_norm": 0.22972284257411957, "learning_rate": 1.468485951560743e-05, "loss": 0.0065, "num_input_tokens_seen": 31698400, "step": 9525 }, { "epoch": 1.9068103944176276, "grad_norm": 0.7997241616249084, "learning_rate": 1.4661003303624619e-05, "loss": 0.0184, "num_input_tokens_seen": 31714784, "step": 9530 }, { "epoch": 1.9078108195983292, "grad_norm": 0.12815675139427185, "learning_rate": 1.4637158443237187e-05, "loss": 0.0082, "num_input_tokens_seen": 31731168, "step": 9535 }, { "epoch": 1.908811244779031, "grad_norm": 0.04719047620892525, "learning_rate": 1.4613324960625346e-05, "loss": 0.0084, "num_input_tokens_seen": 31748544, "step": 9540 }, { "epoch": 1.909811669959733, "grad_norm": 0.7937561273574829, "learning_rate": 1.4589502881956838e-05, "loss": 0.0273, "num_input_tokens_seen": 31764864, "step": 9545 }, { "epoch": 1.9108120951404346, "grad_norm": 0.45275047421455383, "learning_rate": 1.4565692233386857e-05, "loss": 0.0076, "num_input_tokens_seen": 31780736, "step": 9550 }, { "epoch": 1.9118125203211365, "grad_norm": 0.061869651079177856, "learning_rate": 1.4541893041058068e-05, "loss": 0.0275, "num_input_tokens_seen": 31797856, "step": 9555 }, { "epoch": 1.9128129455018383, "grad_norm": 0.6054242253303528, "learning_rate": 1.451810533110055e-05, "loss": 0.0151, "num_input_tokens_seen": 31813440, "step": 9560 }, { "epoch": 1.91381337068254, "grad_norm": 0.12409298866987228, "learning_rate": 1.4494329129631786e-05, "loss": 0.0092, "num_input_tokens_seen": 31830048, "step": 9565 }, { "epoch": 1.9148137958632419, "grad_norm": 0.36158084869384766, "learning_rate": 1.4470564462756591e-05, "loss": 0.0245, "num_input_tokens_seen": 31846016, "step": 9570 }, { "epoch": 1.9158142210439437, "grad_norm": 0.6042336225509644, "learning_rate": 1.4446811356567147e-05, "loss": 0.0122, "num_input_tokens_seen": 31861056, "step": 9575 }, { "epoch": 1.9168146462246454, "grad_norm": 0.11119186133146286, "learning_rate": 1.442306983714294e-05, "loss": 0.0076, "num_input_tokens_seen": 31877248, "step": 9580 }, { "epoch": 1.9178150714053472, "grad_norm": 0.3187255859375, "learning_rate": 1.4399339930550716e-05, "loss": 0.0234, "num_input_tokens_seen": 31892928, "step": 9585 }, { "epoch": 1.918815496586049, "grad_norm": 0.45982155203819275, "learning_rate": 1.4375621662844485e-05, "loss": 0.019, "num_input_tokens_seen": 31908576, "step": 9590 }, { "epoch": 1.9198159217667508, "grad_norm": 1.2846043109893799, "learning_rate": 1.4351915060065488e-05, "loss": 0.0203, "num_input_tokens_seen": 31924160, "step": 9595 }, { "epoch": 1.9208163469474526, "grad_norm": 0.534823477268219, "learning_rate": 1.4328220148242127e-05, "loss": 0.0306, "num_input_tokens_seen": 31941920, "step": 9600 }, { "epoch": 1.9218167721281545, "grad_norm": 0.16988834738731384, "learning_rate": 1.4304536953389996e-05, "loss": 0.0084, "num_input_tokens_seen": 31959072, "step": 9605 }, { "epoch": 1.9228171973088561, "grad_norm": 0.36865609884262085, "learning_rate": 1.428086550151182e-05, "loss": 0.0119, "num_input_tokens_seen": 31975584, "step": 9610 }, { "epoch": 1.923817622489558, "grad_norm": 0.47268640995025635, "learning_rate": 1.4257205818597424e-05, "loss": 0.0138, "num_input_tokens_seen": 31991360, "step": 9615 }, { "epoch": 1.9248180476702599, "grad_norm": 0.41165751218795776, "learning_rate": 1.4233557930623725e-05, "loss": 0.0142, "num_input_tokens_seen": 32007648, "step": 9620 }, { "epoch": 1.9258184728509615, "grad_norm": 0.7218590378761292, "learning_rate": 1.4209921863554654e-05, "loss": 0.0218, "num_input_tokens_seen": 32023872, "step": 9625 }, { "epoch": 1.9268188980316636, "grad_norm": 0.5501959919929504, "learning_rate": 1.4186297643341207e-05, "loss": 0.0186, "num_input_tokens_seen": 32041280, "step": 9630 }, { "epoch": 1.9278193232123653, "grad_norm": 0.3440016806125641, "learning_rate": 1.4162685295921358e-05, "loss": 0.0127, "num_input_tokens_seen": 32057408, "step": 9635 }, { "epoch": 1.928819748393067, "grad_norm": 0.5781524181365967, "learning_rate": 1.4139084847220024e-05, "loss": 0.013, "num_input_tokens_seen": 32073344, "step": 9640 }, { "epoch": 1.929820173573769, "grad_norm": 1.9640519618988037, "learning_rate": 1.4115496323149086e-05, "loss": 0.0204, "num_input_tokens_seen": 32090464, "step": 9645 }, { "epoch": 1.9308205987544707, "grad_norm": 0.5000998973846436, "learning_rate": 1.409191974960733e-05, "loss": 0.0223, "num_input_tokens_seen": 32107936, "step": 9650 }, { "epoch": 1.9318210239351723, "grad_norm": 1.0520826578140259, "learning_rate": 1.4068355152480384e-05, "loss": 0.0345, "num_input_tokens_seen": 32125632, "step": 9655 }, { "epoch": 1.9328214491158744, "grad_norm": 0.2433328628540039, "learning_rate": 1.4044802557640795e-05, "loss": 0.0197, "num_input_tokens_seen": 32142144, "step": 9660 }, { "epoch": 1.933821874296576, "grad_norm": 1.2062532901763916, "learning_rate": 1.4021261990947867e-05, "loss": 0.0179, "num_input_tokens_seen": 32157536, "step": 9665 }, { "epoch": 1.9348222994772777, "grad_norm": 0.26744315028190613, "learning_rate": 1.399773347824773e-05, "loss": 0.0249, "num_input_tokens_seen": 32173632, "step": 9670 }, { "epoch": 1.9358227246579798, "grad_norm": 0.1802188903093338, "learning_rate": 1.3974217045373284e-05, "loss": 0.0169, "num_input_tokens_seen": 32190816, "step": 9675 }, { "epoch": 1.9368231498386814, "grad_norm": 0.3061814606189728, "learning_rate": 1.3950712718144133e-05, "loss": 0.0166, "num_input_tokens_seen": 32206016, "step": 9680 }, { "epoch": 1.9378235750193833, "grad_norm": 0.2795793116092682, "learning_rate": 1.3927220522366624e-05, "loss": 0.0109, "num_input_tokens_seen": 32223008, "step": 9685 }, { "epoch": 1.9388240002000852, "grad_norm": 0.250918984413147, "learning_rate": 1.3903740483833788e-05, "loss": 0.0111, "num_input_tokens_seen": 32239808, "step": 9690 }, { "epoch": 1.9398244253807868, "grad_norm": 0.6707315444946289, "learning_rate": 1.3880272628325264e-05, "loss": 0.0159, "num_input_tokens_seen": 32256192, "step": 9695 }, { "epoch": 1.9408248505614887, "grad_norm": 0.18448741734027863, "learning_rate": 1.3856816981607355e-05, "loss": 0.0163, "num_input_tokens_seen": 32272736, "step": 9700 }, { "epoch": 1.9418252757421905, "grad_norm": 0.5773769617080688, "learning_rate": 1.383337356943295e-05, "loss": 0.0121, "num_input_tokens_seen": 32290144, "step": 9705 }, { "epoch": 1.9428257009228922, "grad_norm": 0.10217957198619843, "learning_rate": 1.38099424175415e-05, "loss": 0.0172, "num_input_tokens_seen": 32306496, "step": 9710 }, { "epoch": 1.943826126103594, "grad_norm": 0.021434210240840912, "learning_rate": 1.3786523551659013e-05, "loss": 0.0162, "num_input_tokens_seen": 32324160, "step": 9715 }, { "epoch": 1.944826551284296, "grad_norm": 0.20603245496749878, "learning_rate": 1.3763116997497968e-05, "loss": 0.0249, "num_input_tokens_seen": 32340000, "step": 9720 }, { "epoch": 1.9458269764649976, "grad_norm": 0.4069787561893463, "learning_rate": 1.3739722780757364e-05, "loss": 0.0299, "num_input_tokens_seen": 32356416, "step": 9725 }, { "epoch": 1.9468274016456995, "grad_norm": 0.08867352455854416, "learning_rate": 1.3716340927122647e-05, "loss": 0.0144, "num_input_tokens_seen": 32373696, "step": 9730 }, { "epoch": 1.9478278268264013, "grad_norm": 0.9749134182929993, "learning_rate": 1.369297146226567e-05, "loss": 0.0156, "num_input_tokens_seen": 32389824, "step": 9735 }, { "epoch": 1.948828252007103, "grad_norm": 0.26606303453445435, "learning_rate": 1.36696144118447e-05, "loss": 0.0232, "num_input_tokens_seen": 32405568, "step": 9740 }, { "epoch": 1.9498286771878048, "grad_norm": 0.22685298323631287, "learning_rate": 1.3646269801504388e-05, "loss": 0.0304, "num_input_tokens_seen": 32421344, "step": 9745 }, { "epoch": 1.9508291023685067, "grad_norm": 0.25356295704841614, "learning_rate": 1.3622937656875685e-05, "loss": 0.0138, "num_input_tokens_seen": 32437504, "step": 9750 }, { "epoch": 1.9518295275492084, "grad_norm": 0.7330453991889954, "learning_rate": 1.359961800357589e-05, "loss": 0.0129, "num_input_tokens_seen": 32453152, "step": 9755 }, { "epoch": 1.9528299527299102, "grad_norm": 0.6261770725250244, "learning_rate": 1.3576310867208577e-05, "loss": 0.011, "num_input_tokens_seen": 32470912, "step": 9760 }, { "epoch": 1.953830377910612, "grad_norm": 0.3458264172077179, "learning_rate": 1.3553016273363578e-05, "loss": 0.0171, "num_input_tokens_seen": 32488000, "step": 9765 }, { "epoch": 1.9548308030913137, "grad_norm": 0.7723591327667236, "learning_rate": 1.3529734247616965e-05, "loss": 0.0124, "num_input_tokens_seen": 32503392, "step": 9770 }, { "epoch": 1.9558312282720156, "grad_norm": 0.3869456350803375, "learning_rate": 1.350646481553098e-05, "loss": 0.0131, "num_input_tokens_seen": 32520160, "step": 9775 }, { "epoch": 1.9568316534527175, "grad_norm": 0.3871535360813141, "learning_rate": 1.3483208002654063e-05, "loss": 0.0171, "num_input_tokens_seen": 32537984, "step": 9780 }, { "epoch": 1.9578320786334191, "grad_norm": 0.1062915176153183, "learning_rate": 1.3459963834520806e-05, "loss": 0.0211, "num_input_tokens_seen": 32555296, "step": 9785 }, { "epoch": 1.958832503814121, "grad_norm": 0.2570017874240875, "learning_rate": 1.3436732336651887e-05, "loss": 0.0059, "num_input_tokens_seen": 32572736, "step": 9790 }, { "epoch": 1.9598329289948229, "grad_norm": 1.2094992399215698, "learning_rate": 1.3413513534554096e-05, "loss": 0.0247, "num_input_tokens_seen": 32588384, "step": 9795 }, { "epoch": 1.9608333541755245, "grad_norm": 0.49232643842697144, "learning_rate": 1.3390307453720292e-05, "loss": 0.0081, "num_input_tokens_seen": 32604768, "step": 9800 }, { "epoch": 1.9618337793562264, "grad_norm": 0.025176681578159332, "learning_rate": 1.336711411962933e-05, "loss": 0.0121, "num_input_tokens_seen": 32620384, "step": 9805 }, { "epoch": 1.9628342045369283, "grad_norm": 0.6701842546463013, "learning_rate": 1.3343933557746108e-05, "loss": 0.0219, "num_input_tokens_seen": 32636128, "step": 9810 }, { "epoch": 1.96383462971763, "grad_norm": 0.750227153301239, "learning_rate": 1.3320765793521484e-05, "loss": 0.0348, "num_input_tokens_seen": 32653824, "step": 9815 }, { "epoch": 1.9648350548983318, "grad_norm": 0.0017446859274059534, "learning_rate": 1.3297610852392264e-05, "loss": 0.0332, "num_input_tokens_seen": 32669536, "step": 9820 }, { "epoch": 1.9658354800790336, "grad_norm": 0.49023136496543884, "learning_rate": 1.3274468759781195e-05, "loss": 0.0238, "num_input_tokens_seen": 32687392, "step": 9825 }, { "epoch": 1.9668359052597353, "grad_norm": 0.20972096920013428, "learning_rate": 1.3251339541096874e-05, "loss": 0.0429, "num_input_tokens_seen": 32705120, "step": 9830 }, { "epoch": 1.9678363304404372, "grad_norm": 0.4810045659542084, "learning_rate": 1.32282232217338e-05, "loss": 0.0316, "num_input_tokens_seen": 32721088, "step": 9835 }, { "epoch": 1.968836755621139, "grad_norm": 0.5527705550193787, "learning_rate": 1.3205119827072315e-05, "loss": 0.0208, "num_input_tokens_seen": 32737376, "step": 9840 }, { "epoch": 1.9698371808018407, "grad_norm": 1.1657907962799072, "learning_rate": 1.3182029382478528e-05, "loss": 0.0364, "num_input_tokens_seen": 32753312, "step": 9845 }, { "epoch": 1.9708376059825425, "grad_norm": 0.5193189382553101, "learning_rate": 1.3158951913304366e-05, "loss": 0.0203, "num_input_tokens_seen": 32770880, "step": 9850 }, { "epoch": 1.9718380311632444, "grad_norm": 0.48777249455451965, "learning_rate": 1.3135887444887513e-05, "loss": 0.017, "num_input_tokens_seen": 32788544, "step": 9855 }, { "epoch": 1.972838456343946, "grad_norm": 0.12154678255319595, "learning_rate": 1.3112836002551332e-05, "loss": 0.0132, "num_input_tokens_seen": 32804384, "step": 9860 }, { "epoch": 1.973838881524648, "grad_norm": 0.1488259881734848, "learning_rate": 1.3089797611604949e-05, "loss": 0.0089, "num_input_tokens_seen": 32820864, "step": 9865 }, { "epoch": 1.9748393067053498, "grad_norm": 0.40845534205436707, "learning_rate": 1.306677229734313e-05, "loss": 0.0232, "num_input_tokens_seen": 32839552, "step": 9870 }, { "epoch": 1.9758397318860514, "grad_norm": 0.8280574679374695, "learning_rate": 1.3043760085046264e-05, "loss": 0.0265, "num_input_tokens_seen": 32856800, "step": 9875 }, { "epoch": 1.9768401570667535, "grad_norm": 0.20856891572475433, "learning_rate": 1.3020760999980385e-05, "loss": 0.0125, "num_input_tokens_seen": 32873728, "step": 9880 }, { "epoch": 1.9778405822474552, "grad_norm": 0.4518989324569702, "learning_rate": 1.2997775067397095e-05, "loss": 0.0107, "num_input_tokens_seen": 32888800, "step": 9885 }, { "epoch": 1.9788410074281568, "grad_norm": 0.8835201859474182, "learning_rate": 1.2974802312533569e-05, "loss": 0.0389, "num_input_tokens_seen": 32906144, "step": 9890 }, { "epoch": 1.979841432608859, "grad_norm": 0.5036636590957642, "learning_rate": 1.2951842760612509e-05, "loss": 0.0213, "num_input_tokens_seen": 32921792, "step": 9895 }, { "epoch": 1.9808418577895606, "grad_norm": 0.06584909558296204, "learning_rate": 1.2928896436842108e-05, "loss": 0.0148, "num_input_tokens_seen": 32938528, "step": 9900 }, { "epoch": 1.9818422829702622, "grad_norm": 0.184793159365654, "learning_rate": 1.2905963366416057e-05, "loss": 0.0373, "num_input_tokens_seen": 32954784, "step": 9905 }, { "epoch": 1.9828427081509643, "grad_norm": 0.11269237101078033, "learning_rate": 1.288304357451348e-05, "loss": 0.0206, "num_input_tokens_seen": 32971328, "step": 9910 }, { "epoch": 1.983843133331666, "grad_norm": 1.0727444887161255, "learning_rate": 1.2860137086298934e-05, "loss": 0.0213, "num_input_tokens_seen": 32987552, "step": 9915 }, { "epoch": 1.9848435585123676, "grad_norm": 0.5628823637962341, "learning_rate": 1.283724392692236e-05, "loss": 0.0141, "num_input_tokens_seen": 33004672, "step": 9920 }, { "epoch": 1.9858439836930697, "grad_norm": 0.7233517169952393, "learning_rate": 1.2814364121519074e-05, "loss": 0.0287, "num_input_tokens_seen": 33022400, "step": 9925 }, { "epoch": 1.9868444088737713, "grad_norm": 0.36505573987960815, "learning_rate": 1.279149769520971e-05, "loss": 0.0066, "num_input_tokens_seen": 33038528, "step": 9930 }, { "epoch": 1.987844834054473, "grad_norm": 0.08851584792137146, "learning_rate": 1.2768644673100238e-05, "loss": 0.0144, "num_input_tokens_seen": 33056864, "step": 9935 }, { "epoch": 1.988845259235175, "grad_norm": 0.4341006278991699, "learning_rate": 1.2745805080281886e-05, "loss": 0.0294, "num_input_tokens_seen": 33072736, "step": 9940 }, { "epoch": 1.9898456844158767, "grad_norm": 0.40328821539878845, "learning_rate": 1.2722978941831162e-05, "loss": 0.0201, "num_input_tokens_seen": 33089344, "step": 9945 }, { "epoch": 1.9908461095965786, "grad_norm": 0.24123868346214294, "learning_rate": 1.2700166282809794e-05, "loss": 0.0235, "num_input_tokens_seen": 33105408, "step": 9950 }, { "epoch": 1.9918465347772805, "grad_norm": 0.962817370891571, "learning_rate": 1.2677367128264695e-05, "loss": 0.0334, "num_input_tokens_seen": 33121856, "step": 9955 }, { "epoch": 1.9928469599579821, "grad_norm": 0.2703090012073517, "learning_rate": 1.2654581503227963e-05, "loss": 0.0077, "num_input_tokens_seen": 33140000, "step": 9960 }, { "epoch": 1.993847385138684, "grad_norm": 0.5985310077667236, "learning_rate": 1.2631809432716852e-05, "loss": 0.0256, "num_input_tokens_seen": 33156160, "step": 9965 }, { "epoch": 1.9948478103193859, "grad_norm": 0.005395437590777874, "learning_rate": 1.2609050941733713e-05, "loss": 0.0255, "num_input_tokens_seen": 33172800, "step": 9970 }, { "epoch": 1.9958482355000875, "grad_norm": 0.32826924324035645, "learning_rate": 1.2586306055266006e-05, "loss": 0.0179, "num_input_tokens_seen": 33189152, "step": 9975 }, { "epoch": 1.9968486606807894, "grad_norm": 0.030345836654305458, "learning_rate": 1.2563574798286249e-05, "loss": 0.0103, "num_input_tokens_seen": 33206208, "step": 9980 }, { "epoch": 1.9978490858614912, "grad_norm": 0.15808875858783722, "learning_rate": 1.2540857195751978e-05, "loss": 0.0246, "num_input_tokens_seen": 33222816, "step": 9985 }, { "epoch": 1.9988495110421929, "grad_norm": 0.5443884134292603, "learning_rate": 1.251815327260576e-05, "loss": 0.0197, "num_input_tokens_seen": 33239232, "step": 9990 }, { "epoch": 1.9998499362228948, "grad_norm": 0.31862539052963257, "learning_rate": 1.2495463053775145e-05, "loss": 0.0209, "num_input_tokens_seen": 33256928, "step": 9995 }, { "epoch": 2.0008503614035966, "grad_norm": 0.3561738133430481, "learning_rate": 1.2472786564172604e-05, "loss": 0.0061, "num_input_tokens_seen": 33272928, "step": 10000 }, { "epoch": 2.0018507865842983, "grad_norm": 0.16990409791469574, "learning_rate": 1.2450123828695576e-05, "loss": 0.0111, "num_input_tokens_seen": 33289824, "step": 10005 }, { "epoch": 2.002851211765, "grad_norm": 0.13149109482765198, "learning_rate": 1.2427474872226363e-05, "loss": 0.0056, "num_input_tokens_seen": 33306080, "step": 10010 }, { "epoch": 2.003851636945702, "grad_norm": 0.04130624979734421, "learning_rate": 1.240483971963215e-05, "loss": 0.0125, "num_input_tokens_seen": 33322688, "step": 10015 }, { "epoch": 2.0048520621264037, "grad_norm": 0.3268091380596161, "learning_rate": 1.2382218395765002e-05, "loss": 0.006, "num_input_tokens_seen": 33338048, "step": 10020 }, { "epoch": 2.0058524873071053, "grad_norm": 0.2938809096813202, "learning_rate": 1.2359610925461742e-05, "loss": 0.0101, "num_input_tokens_seen": 33353952, "step": 10025 }, { "epoch": 2.0068529124878074, "grad_norm": 0.4705606997013092, "learning_rate": 1.2337017333544018e-05, "loss": 0.0095, "num_input_tokens_seen": 33370880, "step": 10030 }, { "epoch": 2.007853337668509, "grad_norm": 0.09025679528713226, "learning_rate": 1.231443764481824e-05, "loss": 0.0029, "num_input_tokens_seen": 33387968, "step": 10035 }, { "epoch": 2.008853762849211, "grad_norm": 0.023138917982578278, "learning_rate": 1.2291871884075532e-05, "loss": 0.0073, "num_input_tokens_seen": 33406560, "step": 10040 }, { "epoch": 2.009854188029913, "grad_norm": 0.16036993265151978, "learning_rate": 1.2269320076091742e-05, "loss": 0.0069, "num_input_tokens_seen": 33423392, "step": 10045 }, { "epoch": 2.0108546132106144, "grad_norm": 0.3855239152908325, "learning_rate": 1.2246782245627412e-05, "loss": 0.0065, "num_input_tokens_seen": 33439616, "step": 10050 }, { "epoch": 2.0118550383913165, "grad_norm": 0.2660057544708252, "learning_rate": 1.22242584174277e-05, "loss": 0.009, "num_input_tokens_seen": 33455776, "step": 10055 }, { "epoch": 2.012855463572018, "grad_norm": 0.12530750036239624, "learning_rate": 1.220174861622243e-05, "loss": 0.0178, "num_input_tokens_seen": 33471360, "step": 10060 }, { "epoch": 2.01385588875272, "grad_norm": 0.2971198558807373, "learning_rate": 1.2179252866725981e-05, "loss": 0.007, "num_input_tokens_seen": 33487136, "step": 10065 }, { "epoch": 2.014856313933422, "grad_norm": 0.12205822020769119, "learning_rate": 1.2156771193637359e-05, "loss": 0.0077, "num_input_tokens_seen": 33502816, "step": 10070 }, { "epoch": 2.0158567391141236, "grad_norm": 0.11058386415243149, "learning_rate": 1.2134303621640087e-05, "loss": 0.0029, "num_input_tokens_seen": 33518560, "step": 10075 }, { "epoch": 2.016857164294825, "grad_norm": 0.11671654880046844, "learning_rate": 1.2111850175402183e-05, "loss": 0.0035, "num_input_tokens_seen": 33535328, "step": 10080 }, { "epoch": 2.0178575894755273, "grad_norm": 0.25610536336898804, "learning_rate": 1.208941087957619e-05, "loss": 0.0104, "num_input_tokens_seen": 33551520, "step": 10085 }, { "epoch": 2.018858014656229, "grad_norm": 0.21770890057086945, "learning_rate": 1.2066985758799113e-05, "loss": 0.005, "num_input_tokens_seen": 33567840, "step": 10090 }, { "epoch": 2.0198584398369306, "grad_norm": 0.0675746500492096, "learning_rate": 1.204457483769236e-05, "loss": 0.0047, "num_input_tokens_seen": 33584576, "step": 10095 }, { "epoch": 2.0208588650176327, "grad_norm": 0.09338697791099548, "learning_rate": 1.2022178140861787e-05, "loss": 0.0041, "num_input_tokens_seen": 33601600, "step": 10100 }, { "epoch": 2.0218592901983343, "grad_norm": 0.1736832857131958, "learning_rate": 1.1999795692897623e-05, "loss": 0.006, "num_input_tokens_seen": 33618592, "step": 10105 }, { "epoch": 2.022859715379036, "grad_norm": 0.4741303324699402, "learning_rate": 1.1977427518374428e-05, "loss": 0.0073, "num_input_tokens_seen": 33635008, "step": 10110 }, { "epoch": 2.023860140559738, "grad_norm": 0.46921661496162415, "learning_rate": 1.1955073641851125e-05, "loss": 0.0173, "num_input_tokens_seen": 33652224, "step": 10115 }, { "epoch": 2.0248605657404397, "grad_norm": 0.16761521995067596, "learning_rate": 1.1932734087870917e-05, "loss": 0.0053, "num_input_tokens_seen": 33667904, "step": 10120 }, { "epoch": 2.0258609909211414, "grad_norm": 0.506805956363678, "learning_rate": 1.1910408880961291e-05, "loss": 0.0086, "num_input_tokens_seen": 33685376, "step": 10125 }, { "epoch": 2.0268614161018434, "grad_norm": 0.5889264941215515, "learning_rate": 1.1888098045633988e-05, "loss": 0.0064, "num_input_tokens_seen": 33700928, "step": 10130 }, { "epoch": 2.027861841282545, "grad_norm": 0.128665953874588, "learning_rate": 1.1865801606384944e-05, "loss": 0.0024, "num_input_tokens_seen": 33716384, "step": 10135 }, { "epoch": 2.0288622664632467, "grad_norm": 0.0878659337759018, "learning_rate": 1.184351958769431e-05, "loss": 0.0128, "num_input_tokens_seen": 33734272, "step": 10140 }, { "epoch": 2.029862691643949, "grad_norm": 0.2913253605365753, "learning_rate": 1.1821252014026413e-05, "loss": 0.0082, "num_input_tokens_seen": 33750752, "step": 10145 }, { "epoch": 2.0308631168246505, "grad_norm": 0.05499362573027611, "learning_rate": 1.179899890982969e-05, "loss": 0.007, "num_input_tokens_seen": 33766784, "step": 10150 }, { "epoch": 2.031863542005352, "grad_norm": 0.35297057032585144, "learning_rate": 1.1776760299536713e-05, "loss": 0.006, "num_input_tokens_seen": 33784544, "step": 10155 }, { "epoch": 2.032863967186054, "grad_norm": 0.06490829586982727, "learning_rate": 1.1754536207564149e-05, "loss": 0.0021, "num_input_tokens_seen": 33800960, "step": 10160 }, { "epoch": 2.033864392366756, "grad_norm": 0.5872918367385864, "learning_rate": 1.1732326658312693e-05, "loss": 0.0056, "num_input_tokens_seen": 33817120, "step": 10165 }, { "epoch": 2.0348648175474575, "grad_norm": 0.03993507847189903, "learning_rate": 1.17101316761671e-05, "loss": 0.0041, "num_input_tokens_seen": 33833376, "step": 10170 }, { "epoch": 2.0358652427281596, "grad_norm": 0.09653317928314209, "learning_rate": 1.1687951285496124e-05, "loss": 0.0015, "num_input_tokens_seen": 33850272, "step": 10175 }, { "epoch": 2.0368656679088613, "grad_norm": 0.30753350257873535, "learning_rate": 1.1665785510652494e-05, "loss": 0.0075, "num_input_tokens_seen": 33867072, "step": 10180 }, { "epoch": 2.037866093089563, "grad_norm": 0.9518786668777466, "learning_rate": 1.1643634375972908e-05, "loss": 0.009, "num_input_tokens_seen": 33883232, "step": 10185 }, { "epoch": 2.038866518270265, "grad_norm": 0.5314575433731079, "learning_rate": 1.1621497905777962e-05, "loss": 0.0088, "num_input_tokens_seen": 33899616, "step": 10190 }, { "epoch": 2.0398669434509666, "grad_norm": 0.4062546491622925, "learning_rate": 1.1599376124372166e-05, "loss": 0.0072, "num_input_tokens_seen": 33915136, "step": 10195 }, { "epoch": 2.0408673686316683, "grad_norm": 0.11941560357809067, "learning_rate": 1.1577269056043922e-05, "loss": 0.006, "num_input_tokens_seen": 33930656, "step": 10200 }, { "epoch": 2.0418677938123704, "grad_norm": 0.353001207113266, "learning_rate": 1.1555176725065434e-05, "loss": 0.0061, "num_input_tokens_seen": 33948032, "step": 10205 }, { "epoch": 2.042868218993072, "grad_norm": 0.20794694125652313, "learning_rate": 1.1533099155692762e-05, "loss": 0.0034, "num_input_tokens_seen": 33966816, "step": 10210 }, { "epoch": 2.0438686441737737, "grad_norm": 0.7453445792198181, "learning_rate": 1.1511036372165754e-05, "loss": 0.0133, "num_input_tokens_seen": 33982560, "step": 10215 }, { "epoch": 2.0448690693544758, "grad_norm": 0.7288349270820618, "learning_rate": 1.1488988398707992e-05, "loss": 0.0041, "num_input_tokens_seen": 33999104, "step": 10220 }, { "epoch": 2.0458694945351774, "grad_norm": 0.3507794439792633, "learning_rate": 1.146695525952686e-05, "loss": 0.0109, "num_input_tokens_seen": 34015456, "step": 10225 }, { "epoch": 2.046869919715879, "grad_norm": 0.008352497592568398, "learning_rate": 1.1444936978813386e-05, "loss": 0.0114, "num_input_tokens_seen": 34032032, "step": 10230 }, { "epoch": 2.047870344896581, "grad_norm": 0.28601670265197754, "learning_rate": 1.1422933580742327e-05, "loss": 0.0056, "num_input_tokens_seen": 34048544, "step": 10235 }, { "epoch": 2.048870770077283, "grad_norm": 0.3275420367717743, "learning_rate": 1.1400945089472095e-05, "loss": 0.005, "num_input_tokens_seen": 34065024, "step": 10240 }, { "epoch": 2.0498711952579844, "grad_norm": 0.2914477586746216, "learning_rate": 1.1378971529144714e-05, "loss": 0.0087, "num_input_tokens_seen": 34080320, "step": 10245 }, { "epoch": 2.0508716204386865, "grad_norm": 0.12133269011974335, "learning_rate": 1.1357012923885835e-05, "loss": 0.0063, "num_input_tokens_seen": 34096352, "step": 10250 }, { "epoch": 2.051872045619388, "grad_norm": 0.3188471496105194, "learning_rate": 1.1335069297804693e-05, "loss": 0.0023, "num_input_tokens_seen": 34113696, "step": 10255 }, { "epoch": 2.05287247080009, "grad_norm": 0.023096419870853424, "learning_rate": 1.1313140674994052e-05, "loss": 0.0026, "num_input_tokens_seen": 34130304, "step": 10260 }, { "epoch": 2.053872895980792, "grad_norm": 0.6211907267570496, "learning_rate": 1.1291227079530222e-05, "loss": 0.0073, "num_input_tokens_seen": 34147840, "step": 10265 }, { "epoch": 2.0548733211614936, "grad_norm": 0.0840262845158577, "learning_rate": 1.1269328535473025e-05, "loss": 0.0064, "num_input_tokens_seen": 34163680, "step": 10270 }, { "epoch": 2.055873746342195, "grad_norm": 0.15804554522037506, "learning_rate": 1.1247445066865714e-05, "loss": 0.0145, "num_input_tokens_seen": 34180288, "step": 10275 }, { "epoch": 2.0568741715228973, "grad_norm": 0.09390754997730255, "learning_rate": 1.1225576697735055e-05, "loss": 0.0019, "num_input_tokens_seen": 34197440, "step": 10280 }, { "epoch": 2.057874596703599, "grad_norm": 1.0508612394332886, "learning_rate": 1.120372345209117e-05, "loss": 0.0104, "num_input_tokens_seen": 34213504, "step": 10285 }, { "epoch": 2.058875021884301, "grad_norm": 0.0515342615544796, "learning_rate": 1.1181885353927616e-05, "loss": 0.0061, "num_input_tokens_seen": 34230784, "step": 10290 }, { "epoch": 2.0598754470650027, "grad_norm": 1.2798871994018555, "learning_rate": 1.116006242722132e-05, "loss": 0.0044, "num_input_tokens_seen": 34246464, "step": 10295 }, { "epoch": 2.0608758722457043, "grad_norm": 0.10574700683355331, "learning_rate": 1.1138254695932524e-05, "loss": 0.0123, "num_input_tokens_seen": 34263424, "step": 10300 }, { "epoch": 2.0618762974264064, "grad_norm": 0.6526563763618469, "learning_rate": 1.1116462184004806e-05, "loss": 0.0127, "num_input_tokens_seen": 34279488, "step": 10305 }, { "epoch": 2.062876722607108, "grad_norm": 0.6106184720993042, "learning_rate": 1.1094684915365045e-05, "loss": 0.0089, "num_input_tokens_seen": 34295456, "step": 10310 }, { "epoch": 2.0638771477878097, "grad_norm": 0.10786963254213333, "learning_rate": 1.1072922913923348e-05, "loss": 0.0047, "num_input_tokens_seen": 34311904, "step": 10315 }, { "epoch": 2.064877572968512, "grad_norm": 0.5320205688476562, "learning_rate": 1.1051176203573094e-05, "loss": 0.0079, "num_input_tokens_seen": 34328544, "step": 10320 }, { "epoch": 2.0658779981492135, "grad_norm": 0.08332104980945587, "learning_rate": 1.1029444808190859e-05, "loss": 0.0033, "num_input_tokens_seen": 34346880, "step": 10325 }, { "epoch": 2.066878423329915, "grad_norm": 0.5802815556526184, "learning_rate": 1.1007728751636403e-05, "loss": 0.0077, "num_input_tokens_seen": 34363712, "step": 10330 }, { "epoch": 2.067878848510617, "grad_norm": 1.655693769454956, "learning_rate": 1.0986028057752662e-05, "loss": 0.0099, "num_input_tokens_seen": 34379232, "step": 10335 }, { "epoch": 2.068879273691319, "grad_norm": 0.04671124368906021, "learning_rate": 1.0964342750365667e-05, "loss": 0.0071, "num_input_tokens_seen": 34395936, "step": 10340 }, { "epoch": 2.0698796988720205, "grad_norm": 0.47704026103019714, "learning_rate": 1.0942672853284592e-05, "loss": 0.0097, "num_input_tokens_seen": 34412032, "step": 10345 }, { "epoch": 2.0708801240527226, "grad_norm": 0.32408711314201355, "learning_rate": 1.0921018390301683e-05, "loss": 0.0096, "num_input_tokens_seen": 34428352, "step": 10350 }, { "epoch": 2.0718805492334242, "grad_norm": 0.0068885162472724915, "learning_rate": 1.0899379385192221e-05, "loss": 0.0016, "num_input_tokens_seen": 34445344, "step": 10355 }, { "epoch": 2.072880974414126, "grad_norm": 1.3240070343017578, "learning_rate": 1.0877755861714539e-05, "loss": 0.0122, "num_input_tokens_seen": 34463680, "step": 10360 }, { "epoch": 2.073881399594828, "grad_norm": 0.024377919733524323, "learning_rate": 1.0856147843609968e-05, "loss": 0.0034, "num_input_tokens_seen": 34481088, "step": 10365 }, { "epoch": 2.0748818247755296, "grad_norm": 0.23395225405693054, "learning_rate": 1.0834555354602796e-05, "loss": 0.0044, "num_input_tokens_seen": 34497440, "step": 10370 }, { "epoch": 2.0758822499562313, "grad_norm": 0.13731922209262848, "learning_rate": 1.0812978418400282e-05, "loss": 0.0113, "num_input_tokens_seen": 34514400, "step": 10375 }, { "epoch": 2.0768826751369334, "grad_norm": 0.10927022993564606, "learning_rate": 1.0791417058692601e-05, "loss": 0.0091, "num_input_tokens_seen": 34531680, "step": 10380 }, { "epoch": 2.077883100317635, "grad_norm": 0.002200738526880741, "learning_rate": 1.0769871299152829e-05, "loss": 0.0064, "num_input_tokens_seen": 34549376, "step": 10385 }, { "epoch": 2.0788835254983367, "grad_norm": 0.07721970230340958, "learning_rate": 1.074834116343692e-05, "loss": 0.0146, "num_input_tokens_seen": 34565440, "step": 10390 }, { "epoch": 2.0798839506790388, "grad_norm": 0.17168541252613068, "learning_rate": 1.0726826675183647e-05, "loss": 0.0082, "num_input_tokens_seen": 34582560, "step": 10395 }, { "epoch": 2.0808843758597404, "grad_norm": 0.6241939663887024, "learning_rate": 1.0705327858014627e-05, "loss": 0.0101, "num_input_tokens_seen": 34599456, "step": 10400 }, { "epoch": 2.081884801040442, "grad_norm": 0.056994225829839706, "learning_rate": 1.0683844735534282e-05, "loss": 0.0102, "num_input_tokens_seen": 34616160, "step": 10405 }, { "epoch": 2.082885226221144, "grad_norm": 0.20421095192432404, "learning_rate": 1.0662377331329765e-05, "loss": 0.0081, "num_input_tokens_seen": 34632352, "step": 10410 }, { "epoch": 2.083885651401846, "grad_norm": 0.42918482422828674, "learning_rate": 1.0640925668971e-05, "loss": 0.0064, "num_input_tokens_seen": 34648800, "step": 10415 }, { "epoch": 2.0848860765825474, "grad_norm": 0.022502677515149117, "learning_rate": 1.0619489772010626e-05, "loss": 0.0063, "num_input_tokens_seen": 34664864, "step": 10420 }, { "epoch": 2.0858865017632495, "grad_norm": 0.12481766939163208, "learning_rate": 1.0598069663983948e-05, "loss": 0.0016, "num_input_tokens_seen": 34680736, "step": 10425 }, { "epoch": 2.086886926943951, "grad_norm": 0.04550350829958916, "learning_rate": 1.057666536840898e-05, "loss": 0.0052, "num_input_tokens_seen": 34697664, "step": 10430 }, { "epoch": 2.087887352124653, "grad_norm": 0.00910136103630066, "learning_rate": 1.055527690878633e-05, "loss": 0.0013, "num_input_tokens_seen": 34713952, "step": 10435 }, { "epoch": 2.088887777305355, "grad_norm": 0.4201542139053345, "learning_rate": 1.0533904308599241e-05, "loss": 0.0072, "num_input_tokens_seen": 34730752, "step": 10440 }, { "epoch": 2.0898882024860566, "grad_norm": 0.9132028222084045, "learning_rate": 1.0512547591313555e-05, "loss": 0.0389, "num_input_tokens_seen": 34748096, "step": 10445 }, { "epoch": 2.090888627666758, "grad_norm": 0.2836069166660309, "learning_rate": 1.0491206780377636e-05, "loss": 0.0161, "num_input_tokens_seen": 34765600, "step": 10450 }, { "epoch": 2.0918890528474603, "grad_norm": 0.186019167304039, "learning_rate": 1.0469881899222419e-05, "loss": 0.0088, "num_input_tokens_seen": 34782176, "step": 10455 }, { "epoch": 2.092889478028162, "grad_norm": 0.476346492767334, "learning_rate": 1.0448572971261353e-05, "loss": 0.0057, "num_input_tokens_seen": 34798144, "step": 10460 }, { "epoch": 2.0938899032088636, "grad_norm": 0.48843252658843994, "learning_rate": 1.0427280019890332e-05, "loss": 0.0093, "num_input_tokens_seen": 34814592, "step": 10465 }, { "epoch": 2.0948903283895657, "grad_norm": 0.641743540763855, "learning_rate": 1.0406003068487747e-05, "loss": 0.0038, "num_input_tokens_seen": 34832416, "step": 10470 }, { "epoch": 2.0958907535702673, "grad_norm": 0.03875632956624031, "learning_rate": 1.0384742140414416e-05, "loss": 0.0061, "num_input_tokens_seen": 34848960, "step": 10475 }, { "epoch": 2.096891178750969, "grad_norm": 0.8721215128898621, "learning_rate": 1.0363497259013529e-05, "loss": 0.0091, "num_input_tokens_seen": 34864960, "step": 10480 }, { "epoch": 2.097891603931671, "grad_norm": 0.8210321664810181, "learning_rate": 1.0342268447610722e-05, "loss": 0.0087, "num_input_tokens_seen": 34883264, "step": 10485 }, { "epoch": 2.0988920291123727, "grad_norm": 0.5005853772163391, "learning_rate": 1.0321055729513928e-05, "loss": 0.0161, "num_input_tokens_seen": 34899488, "step": 10490 }, { "epoch": 2.0998924542930744, "grad_norm": 0.21855968236923218, "learning_rate": 1.0299859128013437e-05, "loss": 0.0051, "num_input_tokens_seen": 34917664, "step": 10495 }, { "epoch": 2.1008928794737765, "grad_norm": 0.03365173190832138, "learning_rate": 1.027867866638185e-05, "loss": 0.0054, "num_input_tokens_seen": 34933024, "step": 10500 }, { "epoch": 2.101893304654478, "grad_norm": 0.046729862689971924, "learning_rate": 1.0257514367874022e-05, "loss": 0.0044, "num_input_tokens_seen": 34950080, "step": 10505 }, { "epoch": 2.1028937298351797, "grad_norm": 0.2730543315410614, "learning_rate": 1.0236366255727085e-05, "loss": 0.0212, "num_input_tokens_seen": 34966208, "step": 10510 }, { "epoch": 2.103894155015882, "grad_norm": 0.02403431199491024, "learning_rate": 1.0215234353160399e-05, "loss": 0.0035, "num_input_tokens_seen": 34983360, "step": 10515 }, { "epoch": 2.1048945801965835, "grad_norm": 0.06615236401557922, "learning_rate": 1.0194118683375503e-05, "loss": 0.0041, "num_input_tokens_seen": 35000160, "step": 10520 }, { "epoch": 2.105895005377285, "grad_norm": 0.13943585753440857, "learning_rate": 1.017301926955614e-05, "loss": 0.0034, "num_input_tokens_seen": 35016288, "step": 10525 }, { "epoch": 2.1068954305579872, "grad_norm": 0.6373512148857117, "learning_rate": 1.0151936134868198e-05, "loss": 0.0035, "num_input_tokens_seen": 35032544, "step": 10530 }, { "epoch": 2.107895855738689, "grad_norm": 0.19955801963806152, "learning_rate": 1.0130869302459684e-05, "loss": 0.0045, "num_input_tokens_seen": 35048064, "step": 10535 }, { "epoch": 2.1088962809193905, "grad_norm": 0.06729407608509064, "learning_rate": 1.0109818795460718e-05, "loss": 0.0039, "num_input_tokens_seen": 35064032, "step": 10540 }, { "epoch": 2.1098967061000926, "grad_norm": 0.6858115196228027, "learning_rate": 1.0088784636983473e-05, "loss": 0.0107, "num_input_tokens_seen": 35080352, "step": 10545 }, { "epoch": 2.1108971312807943, "grad_norm": 0.11107315868139267, "learning_rate": 1.0067766850122199e-05, "loss": 0.0094, "num_input_tokens_seen": 35096480, "step": 10550 }, { "epoch": 2.1118975564614964, "grad_norm": 0.18054699897766113, "learning_rate": 1.0046765457953163e-05, "loss": 0.0058, "num_input_tokens_seen": 35113248, "step": 10555 }, { "epoch": 2.112897981642198, "grad_norm": 0.3239651918411255, "learning_rate": 1.0025780483534616e-05, "loss": 0.0112, "num_input_tokens_seen": 35129728, "step": 10560 }, { "epoch": 2.1138984068228996, "grad_norm": 0.35435789823532104, "learning_rate": 1.00048119499068e-05, "loss": 0.0033, "num_input_tokens_seen": 35146464, "step": 10565 }, { "epoch": 2.1148988320036017, "grad_norm": 0.1789417862892151, "learning_rate": 9.983859880091912e-06, "loss": 0.0175, "num_input_tokens_seen": 35163872, "step": 10570 }, { "epoch": 2.1158992571843034, "grad_norm": 0.28334328532218933, "learning_rate": 9.962924297094043e-06, "loss": 0.0066, "num_input_tokens_seen": 35180512, "step": 10575 }, { "epoch": 2.116899682365005, "grad_norm": 0.7784582376480103, "learning_rate": 9.942005223899212e-06, "loss": 0.0052, "num_input_tokens_seen": 35196832, "step": 10580 }, { "epoch": 2.117900107545707, "grad_norm": 0.05406097322702408, "learning_rate": 9.921102683475303e-06, "loss": 0.0034, "num_input_tokens_seen": 35213792, "step": 10585 }, { "epoch": 2.1189005327264088, "grad_norm": 0.17640522122383118, "learning_rate": 9.900216698772042e-06, "loss": 0.004, "num_input_tokens_seen": 35231008, "step": 10590 }, { "epoch": 2.1199009579071104, "grad_norm": 0.11796704679727554, "learning_rate": 9.879347292720989e-06, "loss": 0.004, "num_input_tokens_seen": 35247968, "step": 10595 }, { "epoch": 2.1209013830878125, "grad_norm": 0.06427179276943207, "learning_rate": 9.858494488235484e-06, "loss": 0.0194, "num_input_tokens_seen": 35264128, "step": 10600 }, { "epoch": 2.121901808268514, "grad_norm": 0.7149617075920105, "learning_rate": 9.837658308210652e-06, "loss": 0.0027, "num_input_tokens_seen": 35282304, "step": 10605 }, { "epoch": 2.122902233449216, "grad_norm": 0.14961405098438263, "learning_rate": 9.816838775523374e-06, "loss": 0.0048, "num_input_tokens_seen": 35297792, "step": 10610 }, { "epoch": 2.123902658629918, "grad_norm": 0.10325565189123154, "learning_rate": 9.796035913032226e-06, "loss": 0.0027, "num_input_tokens_seen": 35313792, "step": 10615 }, { "epoch": 2.1249030838106195, "grad_norm": 0.04255054146051407, "learning_rate": 9.775249743577505e-06, "loss": 0.0017, "num_input_tokens_seen": 35329856, "step": 10620 }, { "epoch": 2.125903508991321, "grad_norm": 0.03326898068189621, "learning_rate": 9.75448028998118e-06, "loss": 0.0014, "num_input_tokens_seen": 35347520, "step": 10625 }, { "epoch": 2.1269039341720233, "grad_norm": 0.05380835011601448, "learning_rate": 9.733727575046845e-06, "loss": 0.0115, "num_input_tokens_seen": 35363648, "step": 10630 }, { "epoch": 2.127904359352725, "grad_norm": 0.23670682311058044, "learning_rate": 9.712991621559738e-06, "loss": 0.0082, "num_input_tokens_seen": 35380896, "step": 10635 }, { "epoch": 2.1289047845334266, "grad_norm": 0.13752107322216034, "learning_rate": 9.692272452286685e-06, "loss": 0.0029, "num_input_tokens_seen": 35397120, "step": 10640 }, { "epoch": 2.1299052097141287, "grad_norm": 1.3863645792007446, "learning_rate": 9.67157008997609e-06, "loss": 0.0201, "num_input_tokens_seen": 35414112, "step": 10645 }, { "epoch": 2.1309056348948303, "grad_norm": 1.1318001747131348, "learning_rate": 9.6508845573579e-06, "loss": 0.0149, "num_input_tokens_seen": 35430464, "step": 10650 }, { "epoch": 2.131906060075532, "grad_norm": 0.30626359581947327, "learning_rate": 9.630215877143575e-06, "loss": 0.0028, "num_input_tokens_seen": 35447584, "step": 10655 }, { "epoch": 2.132906485256234, "grad_norm": 0.2643603980541229, "learning_rate": 9.609564072026083e-06, "loss": 0.0036, "num_input_tokens_seen": 35464064, "step": 10660 }, { "epoch": 2.1339069104369357, "grad_norm": 0.03090248629450798, "learning_rate": 9.588929164679871e-06, "loss": 0.0063, "num_input_tokens_seen": 35480704, "step": 10665 }, { "epoch": 2.1349073356176373, "grad_norm": 0.8625773787498474, "learning_rate": 9.56831117776081e-06, "loss": 0.0047, "num_input_tokens_seen": 35499136, "step": 10670 }, { "epoch": 2.1359077607983394, "grad_norm": 0.42030104994773865, "learning_rate": 9.547710133906213e-06, "loss": 0.0171, "num_input_tokens_seen": 35515616, "step": 10675 }, { "epoch": 2.136908185979041, "grad_norm": 0.5008081793785095, "learning_rate": 9.527126055734795e-06, "loss": 0.0043, "num_input_tokens_seen": 35534688, "step": 10680 }, { "epoch": 2.1379086111597427, "grad_norm": 0.25674349069595337, "learning_rate": 9.506558965846604e-06, "loss": 0.0013, "num_input_tokens_seen": 35551904, "step": 10685 }, { "epoch": 2.138909036340445, "grad_norm": 0.038256000727415085, "learning_rate": 9.486008886823095e-06, "loss": 0.0006, "num_input_tokens_seen": 35568992, "step": 10690 }, { "epoch": 2.1399094615211465, "grad_norm": 0.1449941247701645, "learning_rate": 9.465475841227017e-06, "loss": 0.0053, "num_input_tokens_seen": 35585440, "step": 10695 }, { "epoch": 2.140909886701848, "grad_norm": 0.37136611342430115, "learning_rate": 9.444959851602397e-06, "loss": 0.0047, "num_input_tokens_seen": 35601888, "step": 10700 }, { "epoch": 2.14191031188255, "grad_norm": 0.022761758416891098, "learning_rate": 9.424460940474573e-06, "loss": 0.0232, "num_input_tokens_seen": 35618400, "step": 10705 }, { "epoch": 2.142910737063252, "grad_norm": 1.0737857818603516, "learning_rate": 9.4039791303501e-06, "loss": 0.0138, "num_input_tokens_seen": 35635872, "step": 10710 }, { "epoch": 2.1439111622439535, "grad_norm": 0.10832738131284714, "learning_rate": 9.383514443716774e-06, "loss": 0.004, "num_input_tokens_seen": 35652352, "step": 10715 }, { "epoch": 2.1449115874246556, "grad_norm": 0.0014874746557325125, "learning_rate": 9.363066903043601e-06, "loss": 0.0008, "num_input_tokens_seen": 35669120, "step": 10720 }, { "epoch": 2.1459120126053572, "grad_norm": 0.30137619376182556, "learning_rate": 9.342636530780733e-06, "loss": 0.0075, "num_input_tokens_seen": 35685728, "step": 10725 }, { "epoch": 2.146912437786059, "grad_norm": 0.03261270001530647, "learning_rate": 9.322223349359491e-06, "loss": 0.0022, "num_input_tokens_seen": 35703392, "step": 10730 }, { "epoch": 2.147912862966761, "grad_norm": 0.08350440859794617, "learning_rate": 9.301827381192321e-06, "loss": 0.004, "num_input_tokens_seen": 35720480, "step": 10735 }, { "epoch": 2.1489132881474626, "grad_norm": 0.08948685228824615, "learning_rate": 9.28144864867277e-06, "loss": 0.0026, "num_input_tokens_seen": 35737728, "step": 10740 }, { "epoch": 2.1499137133281643, "grad_norm": 0.060130346566438675, "learning_rate": 9.26108717417545e-06, "loss": 0.0086, "num_input_tokens_seen": 35753664, "step": 10745 }, { "epoch": 2.1509141385088664, "grad_norm": 1.0318474769592285, "learning_rate": 9.240742980056049e-06, "loss": 0.0042, "num_input_tokens_seen": 35770688, "step": 10750 }, { "epoch": 2.151914563689568, "grad_norm": 0.5060439109802246, "learning_rate": 9.220416088651249e-06, "loss": 0.0085, "num_input_tokens_seen": 35787712, "step": 10755 }, { "epoch": 2.1529149888702697, "grad_norm": 0.07923345267772675, "learning_rate": 9.200106522278765e-06, "loss": 0.0019, "num_input_tokens_seen": 35804192, "step": 10760 }, { "epoch": 2.1539154140509718, "grad_norm": 0.09731190651655197, "learning_rate": 9.179814303237263e-06, "loss": 0.0073, "num_input_tokens_seen": 35820864, "step": 10765 }, { "epoch": 2.1549158392316734, "grad_norm": 0.12063892930746078, "learning_rate": 9.159539453806385e-06, "loss": 0.0053, "num_input_tokens_seen": 35836896, "step": 10770 }, { "epoch": 2.155916264412375, "grad_norm": 0.8915739059448242, "learning_rate": 9.139281996246699e-06, "loss": 0.007, "num_input_tokens_seen": 35853728, "step": 10775 }, { "epoch": 2.156916689593077, "grad_norm": 0.9608829021453857, "learning_rate": 9.119041952799662e-06, "loss": 0.0077, "num_input_tokens_seen": 35870848, "step": 10780 }, { "epoch": 2.157917114773779, "grad_norm": 0.0003298874362371862, "learning_rate": 9.098819345687622e-06, "loss": 0.0017, "num_input_tokens_seen": 35888128, "step": 10785 }, { "epoch": 2.158917539954481, "grad_norm": 0.026527972891926765, "learning_rate": 9.078614197113794e-06, "loss": 0.0011, "num_input_tokens_seen": 35904992, "step": 10790 }, { "epoch": 2.1599179651351825, "grad_norm": 0.1923072785139084, "learning_rate": 9.058426529262207e-06, "loss": 0.0046, "num_input_tokens_seen": 35920896, "step": 10795 }, { "epoch": 2.160918390315884, "grad_norm": 0.00016449119721073657, "learning_rate": 9.0382563642977e-06, "loss": 0.0009, "num_input_tokens_seen": 35937760, "step": 10800 }, { "epoch": 2.161918815496586, "grad_norm": 0.6942543983459473, "learning_rate": 9.018103724365918e-06, "loss": 0.0097, "num_input_tokens_seen": 35954752, "step": 10805 }, { "epoch": 2.162919240677288, "grad_norm": 0.0013587274588644505, "learning_rate": 8.997968631593217e-06, "loss": 0.0069, "num_input_tokens_seen": 35970432, "step": 10810 }, { "epoch": 2.1639196658579896, "grad_norm": 0.7722641229629517, "learning_rate": 8.977851108086735e-06, "loss": 0.0105, "num_input_tokens_seen": 35986496, "step": 10815 }, { "epoch": 2.1649200910386917, "grad_norm": 0.07212720066308975, "learning_rate": 8.957751175934302e-06, "loss": 0.0026, "num_input_tokens_seen": 36003072, "step": 10820 }, { "epoch": 2.1659205162193933, "grad_norm": 0.03139474615454674, "learning_rate": 8.937668857204412e-06, "loss": 0.0006, "num_input_tokens_seen": 36019680, "step": 10825 }, { "epoch": 2.166920941400095, "grad_norm": 0.331551730632782, "learning_rate": 8.917604173946268e-06, "loss": 0.0037, "num_input_tokens_seen": 36036832, "step": 10830 }, { "epoch": 2.167921366580797, "grad_norm": 0.05217951908707619, "learning_rate": 8.89755714818966e-06, "loss": 0.0067, "num_input_tokens_seen": 36053184, "step": 10835 }, { "epoch": 2.1689217917614987, "grad_norm": 0.08116187900304794, "learning_rate": 8.877527801945012e-06, "loss": 0.0034, "num_input_tokens_seen": 36070688, "step": 10840 }, { "epoch": 2.1699222169422003, "grad_norm": 0.14644820988178253, "learning_rate": 8.857516157203372e-06, "loss": 0.0045, "num_input_tokens_seen": 36086432, "step": 10845 }, { "epoch": 2.1709226421229024, "grad_norm": 1.253024697303772, "learning_rate": 8.837522235936287e-06, "loss": 0.011, "num_input_tokens_seen": 36102976, "step": 10850 }, { "epoch": 2.171923067303604, "grad_norm": 0.13459981977939606, "learning_rate": 8.81754606009589e-06, "loss": 0.0008, "num_input_tokens_seen": 36119616, "step": 10855 }, { "epoch": 2.1729234924843057, "grad_norm": 0.00015339571109507233, "learning_rate": 8.79758765161483e-06, "loss": 0.0021, "num_input_tokens_seen": 36136064, "step": 10860 }, { "epoch": 2.173923917665008, "grad_norm": 0.02076278254389763, "learning_rate": 8.777647032406214e-06, "loss": 0.0059, "num_input_tokens_seen": 36152576, "step": 10865 }, { "epoch": 2.1749243428457095, "grad_norm": 0.4944254159927368, "learning_rate": 8.757724224363654e-06, "loss": 0.0011, "num_input_tokens_seen": 36168320, "step": 10870 }, { "epoch": 2.175924768026411, "grad_norm": 0.014887170866131783, "learning_rate": 8.737819249361195e-06, "loss": 0.0141, "num_input_tokens_seen": 36185792, "step": 10875 }, { "epoch": 2.176925193207113, "grad_norm": 0.007804557681083679, "learning_rate": 8.71793212925329e-06, "loss": 0.0023, "num_input_tokens_seen": 36202048, "step": 10880 }, { "epoch": 2.177925618387815, "grad_norm": 0.16051238775253296, "learning_rate": 8.69806288587481e-06, "loss": 0.0034, "num_input_tokens_seen": 36219168, "step": 10885 }, { "epoch": 2.1789260435685165, "grad_norm": 0.12735103070735931, "learning_rate": 8.678211541040968e-06, "loss": 0.0035, "num_input_tokens_seen": 36236288, "step": 10890 }, { "epoch": 2.1799264687492186, "grad_norm": 0.007964778691530228, "learning_rate": 8.658378116547361e-06, "loss": 0.0003, "num_input_tokens_seen": 36252992, "step": 10895 }, { "epoch": 2.1809268939299202, "grad_norm": 0.00014435694902203977, "learning_rate": 8.638562634169903e-06, "loss": 0.0003, "num_input_tokens_seen": 36269440, "step": 10900 }, { "epoch": 2.181927319110622, "grad_norm": 0.11460757255554199, "learning_rate": 8.618765115664775e-06, "loss": 0.0028, "num_input_tokens_seen": 36285568, "step": 10905 }, { "epoch": 2.182927744291324, "grad_norm": 0.5251172780990601, "learning_rate": 8.598985582768474e-06, "loss": 0.0147, "num_input_tokens_seen": 36302560, "step": 10910 }, { "epoch": 2.1839281694720256, "grad_norm": 0.01360283326357603, "learning_rate": 8.579224057197738e-06, "loss": 0.0036, "num_input_tokens_seen": 36319616, "step": 10915 }, { "epoch": 2.1849285946527273, "grad_norm": 2.9669125080108643, "learning_rate": 8.559480560649519e-06, "loss": 0.0098, "num_input_tokens_seen": 36335808, "step": 10920 }, { "epoch": 2.1859290198334294, "grad_norm": 0.08378507941961288, "learning_rate": 8.539755114800995e-06, "loss": 0.0035, "num_input_tokens_seen": 36351808, "step": 10925 }, { "epoch": 2.186929445014131, "grad_norm": 0.003294085618108511, "learning_rate": 8.520047741309522e-06, "loss": 0.0007, "num_input_tokens_seen": 36368192, "step": 10930 }, { "epoch": 2.1879298701948326, "grad_norm": 1.2479307651519775, "learning_rate": 8.500358461812594e-06, "loss": 0.0115, "num_input_tokens_seen": 36384096, "step": 10935 }, { "epoch": 2.1889302953755347, "grad_norm": 0.0866975262761116, "learning_rate": 8.480687297927862e-06, "loss": 0.0011, "num_input_tokens_seen": 36400512, "step": 10940 }, { "epoch": 2.1899307205562364, "grad_norm": 0.13979388773441315, "learning_rate": 8.461034271253076e-06, "loss": 0.0015, "num_input_tokens_seen": 36416608, "step": 10945 }, { "epoch": 2.190931145736938, "grad_norm": 0.07170166820287704, "learning_rate": 8.441399403366079e-06, "loss": 0.0032, "num_input_tokens_seen": 36433088, "step": 10950 }, { "epoch": 2.19193157091764, "grad_norm": 0.08745460212230682, "learning_rate": 8.421782715824781e-06, "loss": 0.0059, "num_input_tokens_seen": 36450304, "step": 10955 }, { "epoch": 2.1929319960983418, "grad_norm": 0.1059829369187355, "learning_rate": 8.4021842301671e-06, "loss": 0.0036, "num_input_tokens_seen": 36466528, "step": 10960 }, { "epoch": 2.1939324212790434, "grad_norm": 1.6656731367111206, "learning_rate": 8.38260396791101e-06, "loss": 0.0179, "num_input_tokens_seen": 36483360, "step": 10965 }, { "epoch": 2.1949328464597455, "grad_norm": 0.9303325414657593, "learning_rate": 8.363041950554463e-06, "loss": 0.0027, "num_input_tokens_seen": 36500128, "step": 10970 }, { "epoch": 2.195933271640447, "grad_norm": 1.052852749824524, "learning_rate": 8.343498199575354e-06, "loss": 0.0048, "num_input_tokens_seen": 36515648, "step": 10975 }, { "epoch": 2.196933696821149, "grad_norm": 1.1024091243743896, "learning_rate": 8.323972736431556e-06, "loss": 0.0189, "num_input_tokens_seen": 36532576, "step": 10980 }, { "epoch": 2.197934122001851, "grad_norm": 0.06839191168546677, "learning_rate": 8.304465582560864e-06, "loss": 0.0023, "num_input_tokens_seen": 36548832, "step": 10985 }, { "epoch": 2.1989345471825525, "grad_norm": 0.03600554168224335, "learning_rate": 8.284976759380937e-06, "loss": 0.0086, "num_input_tokens_seen": 36565088, "step": 10990 }, { "epoch": 2.199934972363254, "grad_norm": 0.3253752589225769, "learning_rate": 8.265506288289333e-06, "loss": 0.0019, "num_input_tokens_seen": 36580448, "step": 10995 }, { "epoch": 2.2009353975439563, "grad_norm": 0.13283377885818481, "learning_rate": 8.24605419066346e-06, "loss": 0.0065, "num_input_tokens_seen": 36596704, "step": 11000 }, { "epoch": 2.201935822724658, "grad_norm": 1.415050745010376, "learning_rate": 8.22662048786055e-06, "loss": 0.0076, "num_input_tokens_seen": 36613792, "step": 11005 }, { "epoch": 2.2029362479053596, "grad_norm": 0.2603066861629486, "learning_rate": 8.207205201217644e-06, "loss": 0.0022, "num_input_tokens_seen": 36630240, "step": 11010 }, { "epoch": 2.2039366730860617, "grad_norm": 0.22146141529083252, "learning_rate": 8.187808352051543e-06, "loss": 0.0018, "num_input_tokens_seen": 36647136, "step": 11015 }, { "epoch": 2.2049370982667633, "grad_norm": 0.5399295091629028, "learning_rate": 8.168429961658822e-06, "loss": 0.0031, "num_input_tokens_seen": 36662880, "step": 11020 }, { "epoch": 2.205937523447465, "grad_norm": 0.7557527422904968, "learning_rate": 8.1490700513158e-06, "loss": 0.0141, "num_input_tokens_seen": 36678784, "step": 11025 }, { "epoch": 2.206937948628167, "grad_norm": 0.2671133577823639, "learning_rate": 8.12972864227847e-06, "loss": 0.0028, "num_input_tokens_seen": 36695808, "step": 11030 }, { "epoch": 2.2079383738088687, "grad_norm": 0.17605052888393402, "learning_rate": 8.110405755782543e-06, "loss": 0.0055, "num_input_tokens_seen": 36710752, "step": 11035 }, { "epoch": 2.2089387989895704, "grad_norm": 0.21284525096416473, "learning_rate": 8.091101413043395e-06, "loss": 0.0043, "num_input_tokens_seen": 36727232, "step": 11040 }, { "epoch": 2.2099392241702724, "grad_norm": 0.05113387480378151, "learning_rate": 8.071815635256009e-06, "loss": 0.005, "num_input_tokens_seen": 36743904, "step": 11045 }, { "epoch": 2.210939649350974, "grad_norm": 0.6746780872344971, "learning_rate": 8.052548443595031e-06, "loss": 0.0064, "num_input_tokens_seen": 36762944, "step": 11050 }, { "epoch": 2.211940074531676, "grad_norm": 0.5726050734519958, "learning_rate": 8.033299859214657e-06, "loss": 0.0055, "num_input_tokens_seen": 36780224, "step": 11055 }, { "epoch": 2.212940499712378, "grad_norm": 0.5448541641235352, "learning_rate": 8.014069903248683e-06, "loss": 0.0072, "num_input_tokens_seen": 36797568, "step": 11060 }, { "epoch": 2.2139409248930795, "grad_norm": 0.078267902135849, "learning_rate": 7.994858596810448e-06, "loss": 0.0026, "num_input_tokens_seen": 36814432, "step": 11065 }, { "epoch": 2.214941350073781, "grad_norm": 0.40484005212783813, "learning_rate": 7.97566596099279e-06, "loss": 0.0043, "num_input_tokens_seen": 36832096, "step": 11070 }, { "epoch": 2.215941775254483, "grad_norm": 0.2648407220840454, "learning_rate": 7.956492016868077e-06, "loss": 0.0058, "num_input_tokens_seen": 36847936, "step": 11075 }, { "epoch": 2.216942200435185, "grad_norm": 0.037626806646585464, "learning_rate": 7.937336785488156e-06, "loss": 0.0004, "num_input_tokens_seen": 36863616, "step": 11080 }, { "epoch": 2.217942625615887, "grad_norm": 0.0008298809407278895, "learning_rate": 7.918200287884294e-06, "loss": 0.0058, "num_input_tokens_seen": 36879648, "step": 11085 }, { "epoch": 2.2189430507965886, "grad_norm": 0.01786094717681408, "learning_rate": 7.899082545067227e-06, "loss": 0.0007, "num_input_tokens_seen": 36896896, "step": 11090 }, { "epoch": 2.2199434759772902, "grad_norm": 0.058479081839323044, "learning_rate": 7.879983578027078e-06, "loss": 0.0064, "num_input_tokens_seen": 36913760, "step": 11095 }, { "epoch": 2.2209439011579923, "grad_norm": 0.10139820724725723, "learning_rate": 7.860903407733366e-06, "loss": 0.0014, "num_input_tokens_seen": 36930304, "step": 11100 }, { "epoch": 2.221944326338694, "grad_norm": 1.0013636350631714, "learning_rate": 7.841842055134977e-06, "loss": 0.0047, "num_input_tokens_seen": 36947136, "step": 11105 }, { "epoch": 2.2229447515193956, "grad_norm": 0.07671966403722763, "learning_rate": 7.822799541160109e-06, "loss": 0.0079, "num_input_tokens_seen": 36962496, "step": 11110 }, { "epoch": 2.2239451767000977, "grad_norm": 0.21829085052013397, "learning_rate": 7.803775886716303e-06, "loss": 0.0044, "num_input_tokens_seen": 36979392, "step": 11115 }, { "epoch": 2.2249456018807994, "grad_norm": 0.16538646817207336, "learning_rate": 7.784771112690393e-06, "loss": 0.0073, "num_input_tokens_seen": 36996736, "step": 11120 }, { "epoch": 2.225946027061501, "grad_norm": 0.448978066444397, "learning_rate": 7.765785239948462e-06, "loss": 0.0033, "num_input_tokens_seen": 37012608, "step": 11125 }, { "epoch": 2.226946452242203, "grad_norm": 1.5056158304214478, "learning_rate": 7.746818289335861e-06, "loss": 0.0115, "num_input_tokens_seen": 37029280, "step": 11130 }, { "epoch": 2.2279468774229048, "grad_norm": 0.013198950327932835, "learning_rate": 7.727870281677168e-06, "loss": 0.0038, "num_input_tokens_seen": 37045760, "step": 11135 }, { "epoch": 2.2289473026036064, "grad_norm": 0.2678176760673523, "learning_rate": 7.708941237776138e-06, "loss": 0.0054, "num_input_tokens_seen": 37063392, "step": 11140 }, { "epoch": 2.2299477277843085, "grad_norm": 0.024856559932231903, "learning_rate": 7.690031178415724e-06, "loss": 0.0174, "num_input_tokens_seen": 37079936, "step": 11145 }, { "epoch": 2.23094815296501, "grad_norm": 0.018685590475797653, "learning_rate": 7.67114012435804e-06, "loss": 0.0015, "num_input_tokens_seen": 37096224, "step": 11150 }, { "epoch": 2.231948578145712, "grad_norm": 0.20162878930568695, "learning_rate": 7.652268096344315e-06, "loss": 0.0104, "num_input_tokens_seen": 37112896, "step": 11155 }, { "epoch": 2.232949003326414, "grad_norm": 1.602971076965332, "learning_rate": 7.633415115094913e-06, "loss": 0.0122, "num_input_tokens_seen": 37130464, "step": 11160 }, { "epoch": 2.2339494285071155, "grad_norm": 1.0054552555084229, "learning_rate": 7.614581201309254e-06, "loss": 0.0114, "num_input_tokens_seen": 37148096, "step": 11165 }, { "epoch": 2.234949853687817, "grad_norm": 0.8621109127998352, "learning_rate": 7.59576637566585e-06, "loss": 0.0101, "num_input_tokens_seen": 37165856, "step": 11170 }, { "epoch": 2.2359502788685193, "grad_norm": 0.13474692404270172, "learning_rate": 7.576970658822252e-06, "loss": 0.0024, "num_input_tokens_seen": 37182400, "step": 11175 }, { "epoch": 2.236950704049221, "grad_norm": 0.7479958534240723, "learning_rate": 7.558194071415009e-06, "loss": 0.0078, "num_input_tokens_seen": 37198304, "step": 11180 }, { "epoch": 2.2379511292299226, "grad_norm": 0.8761096000671387, "learning_rate": 7.539436634059688e-06, "loss": 0.0035, "num_input_tokens_seen": 37215136, "step": 11185 }, { "epoch": 2.2389515544106247, "grad_norm": 0.5879620909690857, "learning_rate": 7.520698367350837e-06, "loss": 0.004, "num_input_tokens_seen": 37232416, "step": 11190 }, { "epoch": 2.2399519795913263, "grad_norm": 0.04014047607779503, "learning_rate": 7.501979291861927e-06, "loss": 0.0038, "num_input_tokens_seen": 37248768, "step": 11195 }, { "epoch": 2.240952404772028, "grad_norm": 0.0336315892636776, "learning_rate": 7.483279428145382e-06, "loss": 0.0014, "num_input_tokens_seen": 37265984, "step": 11200 }, { "epoch": 2.24195282995273, "grad_norm": 0.35822203755378723, "learning_rate": 7.464598796732525e-06, "loss": 0.0012, "num_input_tokens_seen": 37281632, "step": 11205 }, { "epoch": 2.2429532551334317, "grad_norm": 0.014730866998434067, "learning_rate": 7.445937418133564e-06, "loss": 0.0019, "num_input_tokens_seen": 37297920, "step": 11210 }, { "epoch": 2.2439536803141333, "grad_norm": 1.2691651582717896, "learning_rate": 7.427295312837576e-06, "loss": 0.0142, "num_input_tokens_seen": 37315040, "step": 11215 }, { "epoch": 2.2449541054948354, "grad_norm": 0.8418992757797241, "learning_rate": 7.408672501312455e-06, "loss": 0.0046, "num_input_tokens_seen": 37332864, "step": 11220 }, { "epoch": 2.245954530675537, "grad_norm": 1.1171756982803345, "learning_rate": 7.390069004004932e-06, "loss": 0.0126, "num_input_tokens_seen": 37350112, "step": 11225 }, { "epoch": 2.2469549558562387, "grad_norm": 0.37548863887786865, "learning_rate": 7.371484841340537e-06, "loss": 0.0098, "num_input_tokens_seen": 37366752, "step": 11230 }, { "epoch": 2.247955381036941, "grad_norm": 0.3306463658809662, "learning_rate": 7.35292003372354e-06, "loss": 0.0037, "num_input_tokens_seen": 37383008, "step": 11235 }, { "epoch": 2.2489558062176425, "grad_norm": 0.09577401727437973, "learning_rate": 7.334374601536992e-06, "loss": 0.0127, "num_input_tokens_seen": 37400064, "step": 11240 }, { "epoch": 2.249956231398344, "grad_norm": 1.3612958192825317, "learning_rate": 7.3158485651426685e-06, "loss": 0.0118, "num_input_tokens_seen": 37417376, "step": 11245 }, { "epoch": 2.250956656579046, "grad_norm": 0.19891683757305145, "learning_rate": 7.297341944881017e-06, "loss": 0.0053, "num_input_tokens_seen": 37433696, "step": 11250 }, { "epoch": 2.251957081759748, "grad_norm": 0.17722542583942413, "learning_rate": 7.2788547610712215e-06, "loss": 0.0042, "num_input_tokens_seen": 37450752, "step": 11255 }, { "epoch": 2.25295750694045, "grad_norm": 0.6168380379676819, "learning_rate": 7.2603870340110734e-06, "loss": 0.0121, "num_input_tokens_seen": 37467648, "step": 11260 }, { "epoch": 2.2539579321211516, "grad_norm": 0.46064338088035583, "learning_rate": 7.2419387839770345e-06, "loss": 0.011, "num_input_tokens_seen": 37483808, "step": 11265 }, { "epoch": 2.2549583573018532, "grad_norm": 0.0065690092742443085, "learning_rate": 7.223510031224173e-06, "loss": 0.0033, "num_input_tokens_seen": 37500416, "step": 11270 }, { "epoch": 2.255958782482555, "grad_norm": 0.28575095534324646, "learning_rate": 7.205100795986139e-06, "loss": 0.0072, "num_input_tokens_seen": 37517344, "step": 11275 }, { "epoch": 2.256959207663257, "grad_norm": 0.6938003897666931, "learning_rate": 7.1867110984751715e-06, "loss": 0.0032, "num_input_tokens_seen": 37533824, "step": 11280 }, { "epoch": 2.2579596328439586, "grad_norm": 0.454542338848114, "learning_rate": 7.168340958882053e-06, "loss": 0.0027, "num_input_tokens_seen": 37549568, "step": 11285 }, { "epoch": 2.2589600580246607, "grad_norm": 0.028962474316358566, "learning_rate": 7.14999039737608e-06, "loss": 0.0128, "num_input_tokens_seen": 37566656, "step": 11290 }, { "epoch": 2.2599604832053624, "grad_norm": 0.02451271563768387, "learning_rate": 7.1316594341050675e-06, "loss": 0.0013, "num_input_tokens_seen": 37583424, "step": 11295 }, { "epoch": 2.260960908386064, "grad_norm": 0.48972272872924805, "learning_rate": 7.11334808919531e-06, "loss": 0.0078, "num_input_tokens_seen": 37599360, "step": 11300 }, { "epoch": 2.2619613335667657, "grad_norm": 0.10499460250139236, "learning_rate": 7.095056382751558e-06, "loss": 0.0117, "num_input_tokens_seen": 37615552, "step": 11305 }, { "epoch": 2.2629617587474677, "grad_norm": 0.054841239005327225, "learning_rate": 7.076784334857018e-06, "loss": 0.0046, "num_input_tokens_seen": 37632736, "step": 11310 }, { "epoch": 2.2639621839281694, "grad_norm": 0.31568577885627747, "learning_rate": 7.058531965573273e-06, "loss": 0.0067, "num_input_tokens_seen": 37649440, "step": 11315 }, { "epoch": 2.2649626091088715, "grad_norm": 0.027038007974624634, "learning_rate": 7.040299294940339e-06, "loss": 0.003, "num_input_tokens_seen": 37665888, "step": 11320 }, { "epoch": 2.265963034289573, "grad_norm": 0.35627177357673645, "learning_rate": 7.022086342976594e-06, "loss": 0.0016, "num_input_tokens_seen": 37682816, "step": 11325 }, { "epoch": 2.2669634594702748, "grad_norm": 0.6563451290130615, "learning_rate": 7.003893129678749e-06, "loss": 0.0105, "num_input_tokens_seen": 37699008, "step": 11330 }, { "epoch": 2.2679638846509764, "grad_norm": 0.029778748750686646, "learning_rate": 6.98571967502186e-06, "loss": 0.0039, "num_input_tokens_seen": 37716288, "step": 11335 }, { "epoch": 2.2689643098316785, "grad_norm": 0.818820059299469, "learning_rate": 6.967565998959297e-06, "loss": 0.002, "num_input_tokens_seen": 37732704, "step": 11340 }, { "epoch": 2.26996473501238, "grad_norm": 2.850776195526123, "learning_rate": 6.949432121422683e-06, "loss": 0.0131, "num_input_tokens_seen": 37749216, "step": 11345 }, { "epoch": 2.2709651601930823, "grad_norm": 0.8199514150619507, "learning_rate": 6.931318062321934e-06, "loss": 0.0041, "num_input_tokens_seen": 37765184, "step": 11350 }, { "epoch": 2.271965585373784, "grad_norm": 1.1873388290405273, "learning_rate": 6.913223841545194e-06, "loss": 0.0081, "num_input_tokens_seen": 37781152, "step": 11355 }, { "epoch": 2.2729660105544855, "grad_norm": 1.3290454149246216, "learning_rate": 6.8951494789588264e-06, "loss": 0.007, "num_input_tokens_seen": 37798304, "step": 11360 }, { "epoch": 2.2739664357351876, "grad_norm": 0.0264931358397007, "learning_rate": 6.877094994407402e-06, "loss": 0.0086, "num_input_tokens_seen": 37814080, "step": 11365 }, { "epoch": 2.2749668609158893, "grad_norm": 0.7083085179328918, "learning_rate": 6.859060407713638e-06, "loss": 0.0051, "num_input_tokens_seen": 37829856, "step": 11370 }, { "epoch": 2.275967286096591, "grad_norm": 1.5754274129867554, "learning_rate": 6.8410457386784335e-06, "loss": 0.0077, "num_input_tokens_seen": 37846912, "step": 11375 }, { "epoch": 2.276967711277293, "grad_norm": 1.5928505659103394, "learning_rate": 6.823051007080822e-06, "loss": 0.0235, "num_input_tokens_seen": 37864416, "step": 11380 }, { "epoch": 2.2779681364579947, "grad_norm": 0.2157992571592331, "learning_rate": 6.80507623267791e-06, "loss": 0.003, "num_input_tokens_seen": 37881184, "step": 11385 }, { "epoch": 2.2789685616386963, "grad_norm": 0.4091234803199768, "learning_rate": 6.7871214352049326e-06, "loss": 0.0016, "num_input_tokens_seen": 37896480, "step": 11390 }, { "epoch": 2.2799689868193984, "grad_norm": 0.056644193828105927, "learning_rate": 6.7691866343751745e-06, "loss": 0.0081, "num_input_tokens_seen": 37912224, "step": 11395 }, { "epoch": 2.2809694120001, "grad_norm": 1.0580017566680908, "learning_rate": 6.751271849879961e-06, "loss": 0.0047, "num_input_tokens_seen": 37927744, "step": 11400 }, { "epoch": 2.2819698371808017, "grad_norm": 0.44456249475479126, "learning_rate": 6.733377101388646e-06, "loss": 0.0117, "num_input_tokens_seen": 37943712, "step": 11405 }, { "epoch": 2.282970262361504, "grad_norm": 0.5159664750099182, "learning_rate": 6.715502408548588e-06, "loss": 0.0103, "num_input_tokens_seen": 37960864, "step": 11410 }, { "epoch": 2.2839706875422054, "grad_norm": 0.04254378750920296, "learning_rate": 6.697647790985123e-06, "loss": 0.0064, "num_input_tokens_seen": 37978880, "step": 11415 }, { "epoch": 2.284971112722907, "grad_norm": 0.09852634370326996, "learning_rate": 6.679813268301552e-06, "loss": 0.0075, "num_input_tokens_seen": 37994624, "step": 11420 }, { "epoch": 2.285971537903609, "grad_norm": 1.8152574300765991, "learning_rate": 6.661998860079094e-06, "loss": 0.0158, "num_input_tokens_seen": 38012288, "step": 11425 }, { "epoch": 2.286971963084311, "grad_norm": 0.040850501507520676, "learning_rate": 6.644204585876898e-06, "loss": 0.0056, "num_input_tokens_seen": 38029728, "step": 11430 }, { "epoch": 2.2879723882650125, "grad_norm": 0.6898013949394226, "learning_rate": 6.626430465232014e-06, "loss": 0.0054, "num_input_tokens_seen": 38046112, "step": 11435 }, { "epoch": 2.2889728134457146, "grad_norm": 2.1513373851776123, "learning_rate": 6.608676517659346e-06, "loss": 0.003, "num_input_tokens_seen": 38062368, "step": 11440 }, { "epoch": 2.289973238626416, "grad_norm": 0.37828773260116577, "learning_rate": 6.590942762651659e-06, "loss": 0.0014, "num_input_tokens_seen": 38079520, "step": 11445 }, { "epoch": 2.290973663807118, "grad_norm": 0.4980011284351349, "learning_rate": 6.573229219679555e-06, "loss": 0.0042, "num_input_tokens_seen": 38096320, "step": 11450 }, { "epoch": 2.29197408898782, "grad_norm": 0.7982547879219055, "learning_rate": 6.5555359081914205e-06, "loss": 0.0058, "num_input_tokens_seen": 38112384, "step": 11455 }, { "epoch": 2.2929745141685216, "grad_norm": 0.02630678378045559, "learning_rate": 6.537862847613463e-06, "loss": 0.0013, "num_input_tokens_seen": 38130432, "step": 11460 }, { "epoch": 2.2939749393492233, "grad_norm": 0.009921961463987827, "learning_rate": 6.52021005734964e-06, "loss": 0.0026, "num_input_tokens_seen": 38145344, "step": 11465 }, { "epoch": 2.2949753645299253, "grad_norm": 0.8031901121139526, "learning_rate": 6.502577556781636e-06, "loss": 0.0074, "num_input_tokens_seen": 38161376, "step": 11470 }, { "epoch": 2.295975789710627, "grad_norm": 0.6120681166648865, "learning_rate": 6.484965365268888e-06, "loss": 0.0046, "num_input_tokens_seen": 38178944, "step": 11475 }, { "epoch": 2.2969762148913286, "grad_norm": 0.6634009480476379, "learning_rate": 6.46737350214851e-06, "loss": 0.0091, "num_input_tokens_seen": 38195968, "step": 11480 }, { "epoch": 2.2979766400720307, "grad_norm": 0.17244035005569458, "learning_rate": 6.44980198673531e-06, "loss": 0.0075, "num_input_tokens_seen": 38213280, "step": 11485 }, { "epoch": 2.2989770652527324, "grad_norm": 1.0651710033416748, "learning_rate": 6.432250838321762e-06, "loss": 0.0051, "num_input_tokens_seen": 38229504, "step": 11490 }, { "epoch": 2.299977490433434, "grad_norm": 1.1801623106002808, "learning_rate": 6.4147200761779575e-06, "loss": 0.0239, "num_input_tokens_seen": 38246240, "step": 11495 }, { "epoch": 2.300977915614136, "grad_norm": 1.4246106147766113, "learning_rate": 6.39720971955162e-06, "loss": 0.02, "num_input_tokens_seen": 38262144, "step": 11500 }, { "epoch": 2.3019783407948378, "grad_norm": 0.26787394285202026, "learning_rate": 6.379719787668079e-06, "loss": 0.0036, "num_input_tokens_seen": 38279744, "step": 11505 }, { "epoch": 2.3029787659755394, "grad_norm": 0.01519018318504095, "learning_rate": 6.362250299730196e-06, "loss": 0.0046, "num_input_tokens_seen": 38297184, "step": 11510 }, { "epoch": 2.3039791911562415, "grad_norm": 0.12452206760644913, "learning_rate": 6.344801274918446e-06, "loss": 0.0071, "num_input_tokens_seen": 38313280, "step": 11515 }, { "epoch": 2.304979616336943, "grad_norm": 1.0121663808822632, "learning_rate": 6.327372732390802e-06, "loss": 0.0065, "num_input_tokens_seen": 38329824, "step": 11520 }, { "epoch": 2.3059800415176452, "grad_norm": 0.31872105598449707, "learning_rate": 6.309964691282738e-06, "loss": 0.0017, "num_input_tokens_seen": 38348512, "step": 11525 }, { "epoch": 2.306980466698347, "grad_norm": 0.16815949976444244, "learning_rate": 6.292577170707256e-06, "loss": 0.0097, "num_input_tokens_seen": 38365664, "step": 11530 }, { "epoch": 2.3079808918790485, "grad_norm": 0.09678701311349869, "learning_rate": 6.275210189754782e-06, "loss": 0.012, "num_input_tokens_seen": 38382304, "step": 11535 }, { "epoch": 2.30898131705975, "grad_norm": 0.6526739001274109, "learning_rate": 6.2578637674932275e-06, "loss": 0.0042, "num_input_tokens_seen": 38399584, "step": 11540 }, { "epoch": 2.3099817422404523, "grad_norm": 0.11651992797851562, "learning_rate": 6.2405379229679265e-06, "loss": 0.0155, "num_input_tokens_seen": 38416064, "step": 11545 }, { "epoch": 2.310982167421154, "grad_norm": 0.03568281978368759, "learning_rate": 6.2232326752015956e-06, "loss": 0.0021, "num_input_tokens_seen": 38432640, "step": 11550 }, { "epoch": 2.311982592601856, "grad_norm": 0.10640271008014679, "learning_rate": 6.205948043194359e-06, "loss": 0.0038, "num_input_tokens_seen": 38449440, "step": 11555 }, { "epoch": 2.3129830177825577, "grad_norm": 0.18235915899276733, "learning_rate": 6.1886840459237065e-06, "loss": 0.0059, "num_input_tokens_seen": 38466848, "step": 11560 }, { "epoch": 2.3139834429632593, "grad_norm": 0.034905605018138885, "learning_rate": 6.171440702344464e-06, "loss": 0.0083, "num_input_tokens_seen": 38483552, "step": 11565 }, { "epoch": 2.314983868143961, "grad_norm": 0.21466754376888275, "learning_rate": 6.154218031388784e-06, "loss": 0.0029, "num_input_tokens_seen": 38500832, "step": 11570 }, { "epoch": 2.315984293324663, "grad_norm": 0.5182342529296875, "learning_rate": 6.137016051966127e-06, "loss": 0.0077, "num_input_tokens_seen": 38516672, "step": 11575 }, { "epoch": 2.3169847185053647, "grad_norm": 0.022903772071003914, "learning_rate": 6.119834782963218e-06, "loss": 0.0054, "num_input_tokens_seen": 38534592, "step": 11580 }, { "epoch": 2.317985143686067, "grad_norm": 0.06478303670883179, "learning_rate": 6.102674243244064e-06, "loss": 0.0044, "num_input_tokens_seen": 38551136, "step": 11585 }, { "epoch": 2.3189855688667684, "grad_norm": 1.263426661491394, "learning_rate": 6.085534451649905e-06, "loss": 0.0083, "num_input_tokens_seen": 38567744, "step": 11590 }, { "epoch": 2.31998599404747, "grad_norm": 0.597636878490448, "learning_rate": 6.0684154269991915e-06, "loss": 0.0102, "num_input_tokens_seen": 38584768, "step": 11595 }, { "epoch": 2.3209864192281717, "grad_norm": 1.0190768241882324, "learning_rate": 6.051317188087591e-06, "loss": 0.0168, "num_input_tokens_seen": 38601632, "step": 11600 }, { "epoch": 2.321986844408874, "grad_norm": 0.002115818904712796, "learning_rate": 6.034239753687923e-06, "loss": 0.0027, "num_input_tokens_seen": 38617664, "step": 11605 }, { "epoch": 2.3229872695895755, "grad_norm": 0.2652653753757477, "learning_rate": 6.017183142550187e-06, "loss": 0.005, "num_input_tokens_seen": 38633952, "step": 11610 }, { "epoch": 2.3239876947702776, "grad_norm": 0.32525134086608887, "learning_rate": 6.000147373401535e-06, "loss": 0.0052, "num_input_tokens_seen": 38649952, "step": 11615 }, { "epoch": 2.324988119950979, "grad_norm": 0.05901453271508217, "learning_rate": 5.9831324649461896e-06, "loss": 0.0006, "num_input_tokens_seen": 38665152, "step": 11620 }, { "epoch": 2.325988545131681, "grad_norm": 0.978908121585846, "learning_rate": 5.966138435865507e-06, "loss": 0.0116, "num_input_tokens_seen": 38681568, "step": 11625 }, { "epoch": 2.326988970312383, "grad_norm": 0.013850186951458454, "learning_rate": 5.949165304817908e-06, "loss": 0.0082, "num_input_tokens_seen": 38697792, "step": 11630 }, { "epoch": 2.3279893954930846, "grad_norm": 1.3319257497787476, "learning_rate": 5.932213090438859e-06, "loss": 0.0048, "num_input_tokens_seen": 38714752, "step": 11635 }, { "epoch": 2.3289898206737862, "grad_norm": 0.5809262990951538, "learning_rate": 5.915281811340878e-06, "loss": 0.0045, "num_input_tokens_seen": 38731744, "step": 11640 }, { "epoch": 2.3299902458544883, "grad_norm": 0.34603744745254517, "learning_rate": 5.898371486113491e-06, "loss": 0.0043, "num_input_tokens_seen": 38748224, "step": 11645 }, { "epoch": 2.33099067103519, "grad_norm": 0.34748753905296326, "learning_rate": 5.881482133323205e-06, "loss": 0.0074, "num_input_tokens_seen": 38763808, "step": 11650 }, { "epoch": 2.3319910962158916, "grad_norm": 0.2618560194969177, "learning_rate": 5.864613771513527e-06, "loss": 0.0077, "num_input_tokens_seen": 38779648, "step": 11655 }, { "epoch": 2.3329915213965937, "grad_norm": 0.12622158229351044, "learning_rate": 5.847766419204883e-06, "loss": 0.0021, "num_input_tokens_seen": 38795264, "step": 11660 }, { "epoch": 2.3339919465772954, "grad_norm": 0.0028299703262746334, "learning_rate": 5.830940094894658e-06, "loss": 0.0011, "num_input_tokens_seen": 38812064, "step": 11665 }, { "epoch": 2.334992371757997, "grad_norm": 0.08322950452566147, "learning_rate": 5.814134817057157e-06, "loss": 0.0053, "num_input_tokens_seen": 38828256, "step": 11670 }, { "epoch": 2.335992796938699, "grad_norm": 0.7389622330665588, "learning_rate": 5.797350604143542e-06, "loss": 0.0053, "num_input_tokens_seen": 38845024, "step": 11675 }, { "epoch": 2.3369932221194007, "grad_norm": 0.0005778248887509108, "learning_rate": 5.780587474581878e-06, "loss": 0.0016, "num_input_tokens_seen": 38861280, "step": 11680 }, { "epoch": 2.3379936473001024, "grad_norm": 0.1739816516637802, "learning_rate": 5.763845446777078e-06, "loss": 0.0021, "num_input_tokens_seen": 38879392, "step": 11685 }, { "epoch": 2.3389940724808045, "grad_norm": 0.23818816244602203, "learning_rate": 5.7471245391108594e-06, "loss": 0.0017, "num_input_tokens_seen": 38896704, "step": 11690 }, { "epoch": 2.339994497661506, "grad_norm": 0.20521743595600128, "learning_rate": 5.730424769941786e-06, "loss": 0.0062, "num_input_tokens_seen": 38913248, "step": 11695 }, { "epoch": 2.340994922842208, "grad_norm": 1.6082990169525146, "learning_rate": 5.713746157605199e-06, "loss": 0.0089, "num_input_tokens_seen": 38930240, "step": 11700 }, { "epoch": 2.34199534802291, "grad_norm": 0.7052076458930969, "learning_rate": 5.697088720413202e-06, "loss": 0.007, "num_input_tokens_seen": 38946784, "step": 11705 }, { "epoch": 2.3429957732036115, "grad_norm": 0.014426125213503838, "learning_rate": 5.680452476654657e-06, "loss": 0.0034, "num_input_tokens_seen": 38964352, "step": 11710 }, { "epoch": 2.343996198384313, "grad_norm": 0.027191922068595886, "learning_rate": 5.663837444595163e-06, "loss": 0.0073, "num_input_tokens_seen": 38980288, "step": 11715 }, { "epoch": 2.3449966235650153, "grad_norm": 0.4602048099040985, "learning_rate": 5.647243642477024e-06, "loss": 0.0028, "num_input_tokens_seen": 38996576, "step": 11720 }, { "epoch": 2.345997048745717, "grad_norm": 0.12057361751794815, "learning_rate": 5.6306710885192374e-06, "loss": 0.0009, "num_input_tokens_seen": 39012576, "step": 11725 }, { "epoch": 2.3469974739264186, "grad_norm": 0.13868285715579987, "learning_rate": 5.614119800917459e-06, "loss": 0.002, "num_input_tokens_seen": 39028768, "step": 11730 }, { "epoch": 2.3479978991071206, "grad_norm": 0.05260397493839264, "learning_rate": 5.597589797844008e-06, "loss": 0.0019, "num_input_tokens_seen": 39044640, "step": 11735 }, { "epoch": 2.3489983242878223, "grad_norm": 0.05297001078724861, "learning_rate": 5.5810810974478434e-06, "loss": 0.0013, "num_input_tokens_seen": 39061184, "step": 11740 }, { "epoch": 2.349998749468524, "grad_norm": 0.0046495902352035046, "learning_rate": 5.564593717854505e-06, "loss": 0.0089, "num_input_tokens_seen": 39077312, "step": 11745 }, { "epoch": 2.350999174649226, "grad_norm": 2.4701688289642334, "learning_rate": 5.548127677166154e-06, "loss": 0.0042, "num_input_tokens_seen": 39094176, "step": 11750 }, { "epoch": 2.3519995998299277, "grad_norm": 0.048507627099752426, "learning_rate": 5.531682993461515e-06, "loss": 0.0005, "num_input_tokens_seen": 39111424, "step": 11755 }, { "epoch": 2.3530000250106293, "grad_norm": 1.5936695337295532, "learning_rate": 5.515259684795845e-06, "loss": 0.0081, "num_input_tokens_seen": 39127488, "step": 11760 }, { "epoch": 2.3540004501913314, "grad_norm": 0.12213210016489029, "learning_rate": 5.498857769200957e-06, "loss": 0.0136, "num_input_tokens_seen": 39144192, "step": 11765 }, { "epoch": 2.355000875372033, "grad_norm": 0.7076942920684814, "learning_rate": 5.482477264685165e-06, "loss": 0.0028, "num_input_tokens_seen": 39161920, "step": 11770 }, { "epoch": 2.3560013005527347, "grad_norm": 0.5250160098075867, "learning_rate": 5.4661181892332744e-06, "loss": 0.0018, "num_input_tokens_seen": 39178784, "step": 11775 }, { "epoch": 2.357001725733437, "grad_norm": 0.22318120300769806, "learning_rate": 5.449780560806572e-06, "loss": 0.0039, "num_input_tokens_seen": 39194976, "step": 11780 }, { "epoch": 2.3580021509141385, "grad_norm": 0.07826335728168488, "learning_rate": 5.433464397342777e-06, "loss": 0.0059, "num_input_tokens_seen": 39210752, "step": 11785 }, { "epoch": 2.3590025760948405, "grad_norm": 0.021109146997332573, "learning_rate": 5.41716971675606e-06, "loss": 0.0039, "num_input_tokens_seen": 39227200, "step": 11790 }, { "epoch": 2.360003001275542, "grad_norm": 0.7640042304992676, "learning_rate": 5.400896536937003e-06, "loss": 0.0033, "num_input_tokens_seen": 39243296, "step": 11795 }, { "epoch": 2.361003426456244, "grad_norm": 1.1045968532562256, "learning_rate": 5.3846448757525645e-06, "loss": 0.0067, "num_input_tokens_seen": 39260960, "step": 11800 }, { "epoch": 2.3620038516369455, "grad_norm": 0.10084889829158783, "learning_rate": 5.368414751046097e-06, "loss": 0.0023, "num_input_tokens_seen": 39276416, "step": 11805 }, { "epoch": 2.3630042768176476, "grad_norm": 0.0003986780939158052, "learning_rate": 5.352206180637306e-06, "loss": 0.0104, "num_input_tokens_seen": 39292928, "step": 11810 }, { "epoch": 2.3640047019983492, "grad_norm": 0.014915438368916512, "learning_rate": 5.336019182322205e-06, "loss": 0.0065, "num_input_tokens_seen": 39309184, "step": 11815 }, { "epoch": 2.3650051271790513, "grad_norm": 0.2112547606229782, "learning_rate": 5.319853773873168e-06, "loss": 0.0105, "num_input_tokens_seen": 39326400, "step": 11820 }, { "epoch": 2.366005552359753, "grad_norm": 0.9200066328048706, "learning_rate": 5.303709973038823e-06, "loss": 0.0032, "num_input_tokens_seen": 39343552, "step": 11825 }, { "epoch": 2.3670059775404546, "grad_norm": 1.281941533088684, "learning_rate": 5.287587797544091e-06, "loss": 0.0068, "num_input_tokens_seen": 39361280, "step": 11830 }, { "epoch": 2.3680064027211563, "grad_norm": 0.4280705153942108, "learning_rate": 5.271487265090163e-06, "loss": 0.0076, "num_input_tokens_seen": 39379360, "step": 11835 }, { "epoch": 2.3690068279018583, "grad_norm": 0.6366599798202515, "learning_rate": 5.255408393354436e-06, "loss": 0.0047, "num_input_tokens_seen": 39395904, "step": 11840 }, { "epoch": 2.37000725308256, "grad_norm": 0.7315851449966431, "learning_rate": 5.23935119999055e-06, "loss": 0.0043, "num_input_tokens_seen": 39411552, "step": 11845 }, { "epoch": 2.371007678263262, "grad_norm": 0.9730765223503113, "learning_rate": 5.22331570262834e-06, "loss": 0.0014, "num_input_tokens_seen": 39427776, "step": 11850 }, { "epoch": 2.3720081034439637, "grad_norm": 0.05999741330742836, "learning_rate": 5.207301918873806e-06, "loss": 0.0023, "num_input_tokens_seen": 39443200, "step": 11855 }, { "epoch": 2.3730085286246654, "grad_norm": 1.359703779220581, "learning_rate": 5.191309866309124e-06, "loss": 0.0131, "num_input_tokens_seen": 39459808, "step": 11860 }, { "epoch": 2.3740089538053675, "grad_norm": 0.2668972909450531, "learning_rate": 5.175339562492609e-06, "loss": 0.0029, "num_input_tokens_seen": 39477152, "step": 11865 }, { "epoch": 2.375009378986069, "grad_norm": 0.11682423949241638, "learning_rate": 5.159391024958673e-06, "loss": 0.0135, "num_input_tokens_seen": 39493312, "step": 11870 }, { "epoch": 2.3760098041667708, "grad_norm": 0.002341238083317876, "learning_rate": 5.143464271217877e-06, "loss": 0.0026, "num_input_tokens_seen": 39510176, "step": 11875 }, { "epoch": 2.377010229347473, "grad_norm": 0.8494821190834045, "learning_rate": 5.127559318756814e-06, "loss": 0.007, "num_input_tokens_seen": 39526848, "step": 11880 }, { "epoch": 2.3780106545281745, "grad_norm": 0.15606018900871277, "learning_rate": 5.111676185038172e-06, "loss": 0.0122, "num_input_tokens_seen": 39544288, "step": 11885 }, { "epoch": 2.379011079708876, "grad_norm": 0.10261460393667221, "learning_rate": 5.095814887500683e-06, "loss": 0.0037, "num_input_tokens_seen": 39560320, "step": 11890 }, { "epoch": 2.3800115048895782, "grad_norm": 0.6049176454544067, "learning_rate": 5.0799754435590795e-06, "loss": 0.0045, "num_input_tokens_seen": 39576544, "step": 11895 }, { "epoch": 2.38101193007028, "grad_norm": 0.6998255848884583, "learning_rate": 5.064157870604125e-06, "loss": 0.0069, "num_input_tokens_seen": 39593312, "step": 11900 }, { "epoch": 2.3820123552509815, "grad_norm": 1.1377514600753784, "learning_rate": 5.048362186002567e-06, "loss": 0.0088, "num_input_tokens_seen": 39610624, "step": 11905 }, { "epoch": 2.3830127804316836, "grad_norm": 0.0668625608086586, "learning_rate": 5.0325884070971054e-06, "loss": 0.0019, "num_input_tokens_seen": 39628384, "step": 11910 }, { "epoch": 2.3840132056123853, "grad_norm": 0.583050549030304, "learning_rate": 5.0168365512064e-06, "loss": 0.004, "num_input_tokens_seen": 39643936, "step": 11915 }, { "epoch": 2.385013630793087, "grad_norm": 0.17105671763420105, "learning_rate": 5.001106635625044e-06, "loss": 0.0126, "num_input_tokens_seen": 39660224, "step": 11920 }, { "epoch": 2.386014055973789, "grad_norm": 0.11508852243423462, "learning_rate": 4.985398677623535e-06, "loss": 0.0013, "num_input_tokens_seen": 39676800, "step": 11925 }, { "epoch": 2.3870144811544907, "grad_norm": 1.0608491897583008, "learning_rate": 4.969712694448272e-06, "loss": 0.0167, "num_input_tokens_seen": 39693856, "step": 11930 }, { "epoch": 2.3880149063351923, "grad_norm": 0.2027820646762848, "learning_rate": 4.954048703321506e-06, "loss": 0.0023, "num_input_tokens_seen": 39709984, "step": 11935 }, { "epoch": 2.3890153315158944, "grad_norm": 0.12901784479618073, "learning_rate": 4.93840672144136e-06, "loss": 0.01, "num_input_tokens_seen": 39727040, "step": 11940 }, { "epoch": 2.390015756696596, "grad_norm": 0.0070501226000487804, "learning_rate": 4.922786765981796e-06, "loss": 0.0064, "num_input_tokens_seen": 39742848, "step": 11945 }, { "epoch": 2.3910161818772977, "grad_norm": 0.12959033250808716, "learning_rate": 4.90718885409257e-06, "loss": 0.0048, "num_input_tokens_seen": 39759712, "step": 11950 }, { "epoch": 2.392016607058, "grad_norm": 0.19388934969902039, "learning_rate": 4.891613002899259e-06, "loss": 0.0031, "num_input_tokens_seen": 39775200, "step": 11955 }, { "epoch": 2.3930170322387014, "grad_norm": 1.7149473428726196, "learning_rate": 4.8760592295032104e-06, "loss": 0.0102, "num_input_tokens_seen": 39791648, "step": 11960 }, { "epoch": 2.394017457419403, "grad_norm": 0.009398255497217178, "learning_rate": 4.860527550981519e-06, "loss": 0.013, "num_input_tokens_seen": 39808192, "step": 11965 }, { "epoch": 2.395017882600105, "grad_norm": 0.08744099736213684, "learning_rate": 4.84501798438704e-06, "loss": 0.0076, "num_input_tokens_seen": 39826816, "step": 11970 }, { "epoch": 2.396018307780807, "grad_norm": 0.014029473066329956, "learning_rate": 4.829530546748342e-06, "loss": 0.0095, "num_input_tokens_seen": 39843104, "step": 11975 }, { "epoch": 2.3970187329615085, "grad_norm": 0.044142477214336395, "learning_rate": 4.8140652550697e-06, "loss": 0.0013, "num_input_tokens_seen": 39859584, "step": 11980 }, { "epoch": 2.3980191581422106, "grad_norm": 0.3335197865962982, "learning_rate": 4.798622126331076e-06, "loss": 0.0045, "num_input_tokens_seen": 39876160, "step": 11985 }, { "epoch": 2.399019583322912, "grad_norm": 0.4007914662361145, "learning_rate": 4.783201177488084e-06, "loss": 0.0063, "num_input_tokens_seen": 39892448, "step": 11990 }, { "epoch": 2.400020008503614, "grad_norm": 0.4795380234718323, "learning_rate": 4.767802425471998e-06, "loss": 0.0027, "num_input_tokens_seen": 39908576, "step": 11995 }, { "epoch": 2.401020433684316, "grad_norm": 0.6965258717536926, "learning_rate": 4.7524258871897345e-06, "loss": 0.003, "num_input_tokens_seen": 39925184, "step": 12000 }, { "epoch": 2.4020208588650176, "grad_norm": 0.07378388941287994, "learning_rate": 4.7370715795237885e-06, "loss": 0.0063, "num_input_tokens_seen": 39941664, "step": 12005 }, { "epoch": 2.4030212840457192, "grad_norm": 2.771740436553955, "learning_rate": 4.7217395193322715e-06, "loss": 0.0115, "num_input_tokens_seen": 39958560, "step": 12010 }, { "epoch": 2.4040217092264213, "grad_norm": 0.05147058516740799, "learning_rate": 4.706429723448869e-06, "loss": 0.0089, "num_input_tokens_seen": 39974272, "step": 12015 }, { "epoch": 2.405022134407123, "grad_norm": 0.015699077397584915, "learning_rate": 4.691142208682797e-06, "loss": 0.0074, "num_input_tokens_seen": 39989632, "step": 12020 }, { "epoch": 2.406022559587825, "grad_norm": 0.048658337444067, "learning_rate": 4.675876991818839e-06, "loss": 0.0138, "num_input_tokens_seen": 40005696, "step": 12025 }, { "epoch": 2.4070229847685267, "grad_norm": 1.0035970211029053, "learning_rate": 4.66063408961728e-06, "loss": 0.0043, "num_input_tokens_seen": 40023520, "step": 12030 }, { "epoch": 2.4080234099492284, "grad_norm": 0.024283969774842262, "learning_rate": 4.645413518813907e-06, "loss": 0.0038, "num_input_tokens_seen": 40039584, "step": 12035 }, { "epoch": 2.40902383512993, "grad_norm": 0.01331272255629301, "learning_rate": 4.63021529612e-06, "loss": 0.0027, "num_input_tokens_seen": 40056800, "step": 12040 }, { "epoch": 2.410024260310632, "grad_norm": 0.07054154574871063, "learning_rate": 4.615039438222271e-06, "loss": 0.0121, "num_input_tokens_seen": 40073152, "step": 12045 }, { "epoch": 2.4110246854913338, "grad_norm": 0.25928425788879395, "learning_rate": 4.5998859617829126e-06, "loss": 0.0094, "num_input_tokens_seen": 40089568, "step": 12050 }, { "epoch": 2.412025110672036, "grad_norm": 0.1377958059310913, "learning_rate": 4.584754883439532e-06, "loss": 0.006, "num_input_tokens_seen": 40104736, "step": 12055 }, { "epoch": 2.4130255358527375, "grad_norm": 0.2971647083759308, "learning_rate": 4.569646219805126e-06, "loss": 0.0102, "num_input_tokens_seen": 40122496, "step": 12060 }, { "epoch": 2.414025961033439, "grad_norm": 0.0399310477077961, "learning_rate": 4.554559987468107e-06, "loss": 0.0134, "num_input_tokens_seen": 40139040, "step": 12065 }, { "epoch": 2.415026386214141, "grad_norm": 0.8164964318275452, "learning_rate": 4.539496202992252e-06, "loss": 0.0053, "num_input_tokens_seen": 40155392, "step": 12070 }, { "epoch": 2.416026811394843, "grad_norm": 2.913259983062744, "learning_rate": 4.524454882916676e-06, "loss": 0.0114, "num_input_tokens_seen": 40171808, "step": 12075 }, { "epoch": 2.4170272365755445, "grad_norm": 0.16079522669315338, "learning_rate": 4.50943604375586e-06, "loss": 0.0043, "num_input_tokens_seen": 40187200, "step": 12080 }, { "epoch": 2.4180276617562466, "grad_norm": 0.0017121377168223262, "learning_rate": 4.494439701999567e-06, "loss": 0.0053, "num_input_tokens_seen": 40204992, "step": 12085 }, { "epoch": 2.4190280869369483, "grad_norm": 0.07056961953639984, "learning_rate": 4.479465874112887e-06, "loss": 0.0083, "num_input_tokens_seen": 40222432, "step": 12090 }, { "epoch": 2.42002851211765, "grad_norm": 1.3982396125793457, "learning_rate": 4.464514576536183e-06, "loss": 0.0116, "num_input_tokens_seen": 40238976, "step": 12095 }, { "epoch": 2.4210289372983516, "grad_norm": 0.018442688509821892, "learning_rate": 4.449585825685071e-06, "loss": 0.0011, "num_input_tokens_seen": 40255424, "step": 12100 }, { "epoch": 2.4220293624790536, "grad_norm": 0.058521077036857605, "learning_rate": 4.434679637950417e-06, "loss": 0.0041, "num_input_tokens_seen": 40271456, "step": 12105 }, { "epoch": 2.4230297876597553, "grad_norm": 0.002892683958634734, "learning_rate": 4.419796029698334e-06, "loss": 0.0023, "num_input_tokens_seen": 40286912, "step": 12110 }, { "epoch": 2.4240302128404574, "grad_norm": 0.00801822729408741, "learning_rate": 4.404935017270109e-06, "loss": 0.0077, "num_input_tokens_seen": 40303072, "step": 12115 }, { "epoch": 2.425030638021159, "grad_norm": 0.17238368093967438, "learning_rate": 4.390096616982242e-06, "loss": 0.0013, "num_input_tokens_seen": 40320352, "step": 12120 }, { "epoch": 2.4260310632018607, "grad_norm": 0.005229899659752846, "learning_rate": 4.375280845126409e-06, "loss": 0.0045, "num_input_tokens_seen": 40337792, "step": 12125 }, { "epoch": 2.4270314883825628, "grad_norm": 0.5783362984657288, "learning_rate": 4.36048771796943e-06, "loss": 0.0068, "num_input_tokens_seen": 40355264, "step": 12130 }, { "epoch": 2.4280319135632644, "grad_norm": 0.23066847026348114, "learning_rate": 4.345717251753273e-06, "loss": 0.0099, "num_input_tokens_seen": 40371936, "step": 12135 }, { "epoch": 2.429032338743966, "grad_norm": 1.4478811025619507, "learning_rate": 4.330969462695012e-06, "loss": 0.0024, "num_input_tokens_seen": 40387968, "step": 12140 }, { "epoch": 2.430032763924668, "grad_norm": 0.05630865320563316, "learning_rate": 4.316244366986833e-06, "loss": 0.0019, "num_input_tokens_seen": 40404224, "step": 12145 }, { "epoch": 2.43103318910537, "grad_norm": 1.3600881099700928, "learning_rate": 4.301541980796009e-06, "loss": 0.0113, "num_input_tokens_seen": 40420320, "step": 12150 }, { "epoch": 2.4320336142860715, "grad_norm": 2.776556968688965, "learning_rate": 4.286862320264865e-06, "loss": 0.0255, "num_input_tokens_seen": 40439008, "step": 12155 }, { "epoch": 2.4330340394667735, "grad_norm": 0.07221867889165878, "learning_rate": 4.272205401510787e-06, "loss": 0.0082, "num_input_tokens_seen": 40455040, "step": 12160 }, { "epoch": 2.434034464647475, "grad_norm": 0.12634849548339844, "learning_rate": 4.257571240626198e-06, "loss": 0.002, "num_input_tokens_seen": 40471808, "step": 12165 }, { "epoch": 2.435034889828177, "grad_norm": 0.7212656736373901, "learning_rate": 4.242959853678513e-06, "loss": 0.0198, "num_input_tokens_seen": 40489344, "step": 12170 }, { "epoch": 2.436035315008879, "grad_norm": 0.3485330641269684, "learning_rate": 4.228371256710157e-06, "loss": 0.0098, "num_input_tokens_seen": 40505600, "step": 12175 }, { "epoch": 2.4370357401895806, "grad_norm": 0.12140169739723206, "learning_rate": 4.21380546573853e-06, "loss": 0.0007, "num_input_tokens_seen": 40522080, "step": 12180 }, { "epoch": 2.4380361653702822, "grad_norm": 0.07376435399055481, "learning_rate": 4.199262496755998e-06, "loss": 0.0144, "num_input_tokens_seen": 40538368, "step": 12185 }, { "epoch": 2.4390365905509843, "grad_norm": 0.10095193237066269, "learning_rate": 4.184742365729866e-06, "loss": 0.001, "num_input_tokens_seen": 40554784, "step": 12190 }, { "epoch": 2.440037015731686, "grad_norm": 0.14332710206508636, "learning_rate": 4.170245088602351e-06, "loss": 0.0007, "num_input_tokens_seen": 40570688, "step": 12195 }, { "epoch": 2.4410374409123876, "grad_norm": 0.6795973181724548, "learning_rate": 4.155770681290599e-06, "loss": 0.0018, "num_input_tokens_seen": 40586272, "step": 12200 }, { "epoch": 2.4420378660930897, "grad_norm": 1.7163009643554688, "learning_rate": 4.141319159686641e-06, "loss": 0.0113, "num_input_tokens_seen": 40602400, "step": 12205 }, { "epoch": 2.4430382912737914, "grad_norm": 1.4630216360092163, "learning_rate": 4.1268905396573636e-06, "loss": 0.003, "num_input_tokens_seen": 40618336, "step": 12210 }, { "epoch": 2.444038716454493, "grad_norm": 0.5265514850616455, "learning_rate": 4.112484837044531e-06, "loss": 0.0023, "num_input_tokens_seen": 40635008, "step": 12215 }, { "epoch": 2.445039141635195, "grad_norm": 0.6192420125007629, "learning_rate": 4.0981020676647385e-06, "loss": 0.0037, "num_input_tokens_seen": 40651648, "step": 12220 }, { "epoch": 2.4460395668158967, "grad_norm": 0.3160333037376404, "learning_rate": 4.083742247309394e-06, "loss": 0.0036, "num_input_tokens_seen": 40669184, "step": 12225 }, { "epoch": 2.4470399919965984, "grad_norm": 0.0337165929377079, "learning_rate": 4.069405391744718e-06, "loss": 0.0039, "num_input_tokens_seen": 40685568, "step": 12230 }, { "epoch": 2.4480404171773005, "grad_norm": 0.027851616963744164, "learning_rate": 4.055091516711712e-06, "loss": 0.0011, "num_input_tokens_seen": 40702944, "step": 12235 }, { "epoch": 2.449040842358002, "grad_norm": 1.460172176361084, "learning_rate": 4.040800637926151e-06, "loss": 0.0063, "num_input_tokens_seen": 40720576, "step": 12240 }, { "epoch": 2.4500412675387038, "grad_norm": 0.32231810688972473, "learning_rate": 4.026532771078565e-06, "loss": 0.0019, "num_input_tokens_seen": 40736896, "step": 12245 }, { "epoch": 2.451041692719406, "grad_norm": 1.7847610712051392, "learning_rate": 4.012287931834196e-06, "loss": 0.0052, "num_input_tokens_seen": 40752576, "step": 12250 }, { "epoch": 2.4520421179001075, "grad_norm": 0.21667858958244324, "learning_rate": 3.998066135833031e-06, "loss": 0.0053, "num_input_tokens_seen": 40770432, "step": 12255 }, { "epoch": 2.453042543080809, "grad_norm": 1.9653438329696655, "learning_rate": 3.983867398689747e-06, "loss": 0.0206, "num_input_tokens_seen": 40787360, "step": 12260 }, { "epoch": 2.4540429682615112, "grad_norm": 0.27307531237602234, "learning_rate": 3.969691735993691e-06, "loss": 0.0039, "num_input_tokens_seen": 40804128, "step": 12265 }, { "epoch": 2.455043393442213, "grad_norm": 0.0512768030166626, "learning_rate": 3.955539163308894e-06, "loss": 0.0089, "num_input_tokens_seen": 40820704, "step": 12270 }, { "epoch": 2.4560438186229145, "grad_norm": 0.2971963584423065, "learning_rate": 3.941409696174034e-06, "loss": 0.0056, "num_input_tokens_seen": 40837344, "step": 12275 }, { "epoch": 2.4570442438036166, "grad_norm": 0.05244804173707962, "learning_rate": 3.927303350102393e-06, "loss": 0.0056, "num_input_tokens_seen": 40853824, "step": 12280 }, { "epoch": 2.4580446689843183, "grad_norm": 0.10713346302509308, "learning_rate": 3.913220140581908e-06, "loss": 0.0104, "num_input_tokens_seen": 40870560, "step": 12285 }, { "epoch": 2.4590450941650204, "grad_norm": 1.085828185081482, "learning_rate": 3.899160083075095e-06, "loss": 0.0175, "num_input_tokens_seen": 40886336, "step": 12290 }, { "epoch": 2.460045519345722, "grad_norm": 0.4680262804031372, "learning_rate": 3.885123193019041e-06, "loss": 0.0032, "num_input_tokens_seen": 40902912, "step": 12295 }, { "epoch": 2.4610459445264237, "grad_norm": 0.21633762121200562, "learning_rate": 3.871109485825414e-06, "loss": 0.0165, "num_input_tokens_seen": 40919968, "step": 12300 }, { "epoch": 2.4620463697071253, "grad_norm": 2.2871382236480713, "learning_rate": 3.857118976880409e-06, "loss": 0.0156, "num_input_tokens_seen": 40937184, "step": 12305 }, { "epoch": 2.4630467948878274, "grad_norm": 0.4637632966041565, "learning_rate": 3.843151681544768e-06, "loss": 0.0024, "num_input_tokens_seen": 40955872, "step": 12310 }, { "epoch": 2.464047220068529, "grad_norm": 0.09708676487207413, "learning_rate": 3.829207615153743e-06, "loss": 0.0092, "num_input_tokens_seen": 40972000, "step": 12315 }, { "epoch": 2.465047645249231, "grad_norm": 0.13777290284633636, "learning_rate": 3.815286793017072e-06, "loss": 0.0049, "num_input_tokens_seen": 40989056, "step": 12320 }, { "epoch": 2.466048070429933, "grad_norm": 1.4429576396942139, "learning_rate": 3.801389230418981e-06, "loss": 0.0055, "num_input_tokens_seen": 41005984, "step": 12325 }, { "epoch": 2.4670484956106344, "grad_norm": 0.30174750089645386, "learning_rate": 3.787514942618159e-06, "loss": 0.0011, "num_input_tokens_seen": 41022976, "step": 12330 }, { "epoch": 2.468048920791336, "grad_norm": 0.25705936551094055, "learning_rate": 3.7736639448477356e-06, "loss": 0.006, "num_input_tokens_seen": 41040672, "step": 12335 }, { "epoch": 2.469049345972038, "grad_norm": 0.9293633103370667, "learning_rate": 3.7598362523152714e-06, "loss": 0.0112, "num_input_tokens_seen": 41056832, "step": 12340 }, { "epoch": 2.47004977115274, "grad_norm": 0.7727804183959961, "learning_rate": 3.7460318802027465e-06, "loss": 0.0065, "num_input_tokens_seen": 41073056, "step": 12345 }, { "epoch": 2.471050196333442, "grad_norm": 0.9562500715255737, "learning_rate": 3.7322508436665183e-06, "loss": 0.0081, "num_input_tokens_seen": 41088640, "step": 12350 }, { "epoch": 2.4720506215141436, "grad_norm": 0.004255808889865875, "learning_rate": 3.718493157837344e-06, "loss": 0.0022, "num_input_tokens_seen": 41106144, "step": 12355 }, { "epoch": 2.473051046694845, "grad_norm": 0.43987080454826355, "learning_rate": 3.704758837820327e-06, "loss": 0.0025, "num_input_tokens_seen": 41122176, "step": 12360 }, { "epoch": 2.474051471875547, "grad_norm": 0.2720640301704407, "learning_rate": 3.6910478986949243e-06, "loss": 0.003, "num_input_tokens_seen": 41139072, "step": 12365 }, { "epoch": 2.475051897056249, "grad_norm": 0.7602618932723999, "learning_rate": 3.677360355514928e-06, "loss": 0.0046, "num_input_tokens_seen": 41155616, "step": 12370 }, { "epoch": 2.4760523222369506, "grad_norm": 0.5711652636528015, "learning_rate": 3.6636962233084243e-06, "loss": 0.0044, "num_input_tokens_seen": 41171680, "step": 12375 }, { "epoch": 2.4770527474176527, "grad_norm": 0.3574143648147583, "learning_rate": 3.6500555170778134e-06, "loss": 0.0034, "num_input_tokens_seen": 41188608, "step": 12380 }, { "epoch": 2.4780531725983543, "grad_norm": 0.02313903719186783, "learning_rate": 3.6364382517997676e-06, "loss": 0.0057, "num_input_tokens_seen": 41205184, "step": 12385 }, { "epoch": 2.479053597779056, "grad_norm": 0.022654995322227478, "learning_rate": 3.6228444424252244e-06, "loss": 0.0041, "num_input_tokens_seen": 41221888, "step": 12390 }, { "epoch": 2.480054022959758, "grad_norm": 0.04455249384045601, "learning_rate": 3.6092741038793677e-06, "loss": 0.0041, "num_input_tokens_seen": 41237472, "step": 12395 }, { "epoch": 2.4810544481404597, "grad_norm": 0.005392350722104311, "learning_rate": 3.595727251061615e-06, "loss": 0.0022, "num_input_tokens_seen": 41255104, "step": 12400 }, { "epoch": 2.4820548733211614, "grad_norm": 1.2513427734375, "learning_rate": 3.5822038988455876e-06, "loss": 0.011, "num_input_tokens_seen": 41272032, "step": 12405 }, { "epoch": 2.4830552985018635, "grad_norm": 0.09189336001873016, "learning_rate": 3.568704062079112e-06, "loss": 0.0025, "num_input_tokens_seen": 41288288, "step": 12410 }, { "epoch": 2.484055723682565, "grad_norm": 1.106570839881897, "learning_rate": 3.5552277555842036e-06, "loss": 0.0099, "num_input_tokens_seen": 41303616, "step": 12415 }, { "epoch": 2.4850561488632668, "grad_norm": 1.25088369846344, "learning_rate": 3.5417749941570236e-06, "loss": 0.0034, "num_input_tokens_seen": 41320416, "step": 12420 }, { "epoch": 2.486056574043969, "grad_norm": 0.39379605650901794, "learning_rate": 3.5283457925679054e-06, "loss": 0.0174, "num_input_tokens_seen": 41337760, "step": 12425 }, { "epoch": 2.4870569992246705, "grad_norm": 0.5777842998504639, "learning_rate": 3.5149401655612897e-06, "loss": 0.0031, "num_input_tokens_seen": 41353568, "step": 12430 }, { "epoch": 2.488057424405372, "grad_norm": 0.09307357668876648, "learning_rate": 3.50155812785575e-06, "loss": 0.0018, "num_input_tokens_seen": 41370496, "step": 12435 }, { "epoch": 2.4890578495860742, "grad_norm": 0.009802114218473434, "learning_rate": 3.4881996941439705e-06, "loss": 0.0037, "num_input_tokens_seen": 41386432, "step": 12440 }, { "epoch": 2.490058274766776, "grad_norm": 0.0019708112813532352, "learning_rate": 3.474864879092693e-06, "loss": 0.0031, "num_input_tokens_seen": 41403296, "step": 12445 }, { "epoch": 2.4910586999474775, "grad_norm": 0.3863993287086487, "learning_rate": 3.461553697342745e-06, "loss": 0.002, "num_input_tokens_seen": 41420032, "step": 12450 }, { "epoch": 2.4920591251281796, "grad_norm": 0.7204772233963013, "learning_rate": 3.448266163509009e-06, "loss": 0.0074, "num_input_tokens_seen": 41437792, "step": 12455 }, { "epoch": 2.4930595503088813, "grad_norm": 0.0746840089559555, "learning_rate": 3.4350022921803807e-06, "loss": 0.0043, "num_input_tokens_seen": 41453632, "step": 12460 }, { "epoch": 2.494059975489583, "grad_norm": 0.3205074965953827, "learning_rate": 3.4217620979198063e-06, "loss": 0.0038, "num_input_tokens_seen": 41470336, "step": 12465 }, { "epoch": 2.495060400670285, "grad_norm": 0.013899927958846092, "learning_rate": 3.4085455952642175e-06, "loss": 0.0042, "num_input_tokens_seen": 41486624, "step": 12470 }, { "epoch": 2.4960608258509867, "grad_norm": 0.2298094779253006, "learning_rate": 3.3953527987245298e-06, "loss": 0.0029, "num_input_tokens_seen": 41502528, "step": 12475 }, { "epoch": 2.4970612510316883, "grad_norm": 1.8205831050872803, "learning_rate": 3.382183722785645e-06, "loss": 0.0048, "num_input_tokens_seen": 41519200, "step": 12480 }, { "epoch": 2.4980616762123904, "grad_norm": 0.0019053603755310178, "learning_rate": 3.3690383819064168e-06, "loss": 0.0032, "num_input_tokens_seen": 41536224, "step": 12485 }, { "epoch": 2.499062101393092, "grad_norm": 0.08693382889032364, "learning_rate": 3.3559167905196366e-06, "loss": 0.0093, "num_input_tokens_seen": 41554048, "step": 12490 }, { "epoch": 2.500062526573794, "grad_norm": 0.004477310460060835, "learning_rate": 3.342818963032024e-06, "loss": 0.0041, "num_input_tokens_seen": 41570688, "step": 12495 }, { "epoch": 2.5010629517544958, "grad_norm": 0.614901065826416, "learning_rate": 3.3297449138241925e-06, "loss": 0.0141, "num_input_tokens_seen": 41586944, "step": 12500 }, { "epoch": 2.5020633769351974, "grad_norm": 0.206019788980484, "learning_rate": 3.3166946572506713e-06, "loss": 0.0084, "num_input_tokens_seen": 41603456, "step": 12505 }, { "epoch": 2.503063802115899, "grad_norm": 0.05732341855764389, "learning_rate": 3.303668207639857e-06, "loss": 0.0028, "num_input_tokens_seen": 41619872, "step": 12510 }, { "epoch": 2.504064227296601, "grad_norm": 0.0009395777597092092, "learning_rate": 3.2906655792939974e-06, "loss": 0.0177, "num_input_tokens_seen": 41636832, "step": 12515 }, { "epoch": 2.505064652477303, "grad_norm": 0.8623054623603821, "learning_rate": 3.2776867864892006e-06, "loss": 0.0046, "num_input_tokens_seen": 41653376, "step": 12520 }, { "epoch": 2.506065077658005, "grad_norm": 0.0076326909475028515, "learning_rate": 3.264731843475402e-06, "loss": 0.0036, "num_input_tokens_seen": 41669792, "step": 12525 }, { "epoch": 2.5070655028387065, "grad_norm": 0.265615850687027, "learning_rate": 3.251800764476337e-06, "loss": 0.0069, "num_input_tokens_seen": 41686880, "step": 12530 }, { "epoch": 2.508065928019408, "grad_norm": 1.4407261610031128, "learning_rate": 3.2388935636895603e-06, "loss": 0.0093, "num_input_tokens_seen": 41702752, "step": 12535 }, { "epoch": 2.50906635320011, "grad_norm": 2.3761744499206543, "learning_rate": 3.226010255286399e-06, "loss": 0.008, "num_input_tokens_seen": 41720416, "step": 12540 }, { "epoch": 2.510066778380812, "grad_norm": 0.1614091694355011, "learning_rate": 3.21315085341195e-06, "loss": 0.002, "num_input_tokens_seen": 41736224, "step": 12545 }, { "epoch": 2.5110672035615136, "grad_norm": 1.1754673719406128, "learning_rate": 3.2003153721850644e-06, "loss": 0.0103, "num_input_tokens_seen": 41753248, "step": 12550 }, { "epoch": 2.5120676287422157, "grad_norm": 0.12314176559448242, "learning_rate": 3.187503825698321e-06, "loss": 0.0118, "num_input_tokens_seen": 41769696, "step": 12555 }, { "epoch": 2.5130680539229173, "grad_norm": 0.06970703601837158, "learning_rate": 3.1747162280180264e-06, "loss": 0.0036, "num_input_tokens_seen": 41787328, "step": 12560 }, { "epoch": 2.514068479103619, "grad_norm": 0.03441135585308075, "learning_rate": 3.1619525931842003e-06, "loss": 0.0078, "num_input_tokens_seen": 41805376, "step": 12565 }, { "epoch": 2.5150689042843206, "grad_norm": 0.3269253671169281, "learning_rate": 3.1492129352105354e-06, "loss": 0.0135, "num_input_tokens_seen": 41821888, "step": 12570 }, { "epoch": 2.5160693294650227, "grad_norm": 0.6984061598777771, "learning_rate": 3.1364972680844096e-06, "loss": 0.011, "num_input_tokens_seen": 41838976, "step": 12575 }, { "epoch": 2.5170697546457244, "grad_norm": 0.31647729873657227, "learning_rate": 3.1238056057668693e-06, "loss": 0.0031, "num_input_tokens_seen": 41855168, "step": 12580 }, { "epoch": 2.5180701798264264, "grad_norm": 0.31902697682380676, "learning_rate": 3.1111379621925785e-06, "loss": 0.0044, "num_input_tokens_seen": 41872128, "step": 12585 }, { "epoch": 2.519070605007128, "grad_norm": 0.19838306307792664, "learning_rate": 3.0984943512698567e-06, "loss": 0.0056, "num_input_tokens_seen": 41887936, "step": 12590 }, { "epoch": 2.5200710301878297, "grad_norm": 0.5044189095497131, "learning_rate": 3.085874786880627e-06, "loss": 0.0013, "num_input_tokens_seen": 41904480, "step": 12595 }, { "epoch": 2.5210714553685314, "grad_norm": 0.015526696108281612, "learning_rate": 3.0732792828804093e-06, "loss": 0.0003, "num_input_tokens_seen": 41920608, "step": 12600 }, { "epoch": 2.5220718805492335, "grad_norm": 0.22375400364398956, "learning_rate": 3.0607078530983102e-06, "loss": 0.0013, "num_input_tokens_seen": 41936224, "step": 12605 }, { "epoch": 2.523072305729935, "grad_norm": 0.10406365990638733, "learning_rate": 3.048160511336995e-06, "loss": 0.0009, "num_input_tokens_seen": 41953280, "step": 12610 }, { "epoch": 2.524072730910637, "grad_norm": 0.635587215423584, "learning_rate": 3.035637271372696e-06, "loss": 0.0139, "num_input_tokens_seen": 41971232, "step": 12615 }, { "epoch": 2.525073156091339, "grad_norm": 1.0286800861358643, "learning_rate": 3.023138146955176e-06, "loss": 0.0099, "num_input_tokens_seen": 41987616, "step": 12620 }, { "epoch": 2.5260735812720405, "grad_norm": 0.3848174512386322, "learning_rate": 3.010663151807716e-06, "loss": 0.0019, "num_input_tokens_seen": 42004448, "step": 12625 }, { "epoch": 2.527074006452742, "grad_norm": 0.5619518756866455, "learning_rate": 2.9982122996271095e-06, "loss": 0.0025, "num_input_tokens_seen": 42021120, "step": 12630 }, { "epoch": 2.5280744316334443, "grad_norm": 0.016514690592885017, "learning_rate": 2.985785604083649e-06, "loss": 0.0027, "num_input_tokens_seen": 42037376, "step": 12635 }, { "epoch": 2.529074856814146, "grad_norm": 2.134133815765381, "learning_rate": 2.973383078821082e-06, "loss": 0.0113, "num_input_tokens_seen": 42054080, "step": 12640 }, { "epoch": 2.530075281994848, "grad_norm": 0.11717314273118973, "learning_rate": 2.961004737456652e-06, "loss": 0.0041, "num_input_tokens_seen": 42071616, "step": 12645 }, { "epoch": 2.5310757071755496, "grad_norm": 1.4976444244384766, "learning_rate": 2.9486505935810206e-06, "loss": 0.005, "num_input_tokens_seen": 42087456, "step": 12650 }, { "epoch": 2.5320761323562513, "grad_norm": 0.11834155023097992, "learning_rate": 2.9363206607582962e-06, "loss": 0.0124, "num_input_tokens_seen": 42103648, "step": 12655 }, { "epoch": 2.533076557536953, "grad_norm": 0.11021913588047028, "learning_rate": 2.9240149525260078e-06, "loss": 0.0039, "num_input_tokens_seen": 42119424, "step": 12660 }, { "epoch": 2.534076982717655, "grad_norm": 0.011786202900111675, "learning_rate": 2.911733482395071e-06, "loss": 0.0006, "num_input_tokens_seen": 42135488, "step": 12665 }, { "epoch": 2.5350774078983567, "grad_norm": 1.3259836435317993, "learning_rate": 2.899476263849807e-06, "loss": 0.0053, "num_input_tokens_seen": 42152896, "step": 12670 }, { "epoch": 2.5360778330790588, "grad_norm": 0.031422726809978485, "learning_rate": 2.8872433103479034e-06, "loss": 0.0271, "num_input_tokens_seen": 42168544, "step": 12675 }, { "epoch": 2.5370782582597604, "grad_norm": 0.04686697572469711, "learning_rate": 2.8750346353204003e-06, "loss": 0.0057, "num_input_tokens_seen": 42184832, "step": 12680 }, { "epoch": 2.538078683440462, "grad_norm": 0.01803763210773468, "learning_rate": 2.8628502521716895e-06, "loss": 0.0105, "num_input_tokens_seen": 42200416, "step": 12685 }, { "epoch": 2.539079108621164, "grad_norm": 1.2098323106765747, "learning_rate": 2.850690174279494e-06, "loss": 0.0062, "num_input_tokens_seen": 42215840, "step": 12690 }, { "epoch": 2.540079533801866, "grad_norm": 0.1791813224554062, "learning_rate": 2.83855441499484e-06, "loss": 0.0056, "num_input_tokens_seen": 42233184, "step": 12695 }, { "epoch": 2.5410799589825674, "grad_norm": 1.6025686264038086, "learning_rate": 2.8264429876420695e-06, "loss": 0.0112, "num_input_tokens_seen": 42249472, "step": 12700 }, { "epoch": 2.5420803841632695, "grad_norm": 0.3951073884963989, "learning_rate": 2.8143559055187896e-06, "loss": 0.0029, "num_input_tokens_seen": 42265024, "step": 12705 }, { "epoch": 2.543080809343971, "grad_norm": 0.5251020193099976, "learning_rate": 2.802293181895893e-06, "loss": 0.0076, "num_input_tokens_seen": 42280640, "step": 12710 }, { "epoch": 2.544081234524673, "grad_norm": 0.024615280330181122, "learning_rate": 2.7902548300175273e-06, "loss": 0.0021, "num_input_tokens_seen": 42296864, "step": 12715 }, { "epoch": 2.545081659705375, "grad_norm": 1.228441834449768, "learning_rate": 2.778240863101067e-06, "loss": 0.0096, "num_input_tokens_seen": 42313344, "step": 12720 }, { "epoch": 2.5460820848860766, "grad_norm": 0.009008641354739666, "learning_rate": 2.76625129433713e-06, "loss": 0.0014, "num_input_tokens_seen": 42329888, "step": 12725 }, { "epoch": 2.547082510066778, "grad_norm": 0.1975419521331787, "learning_rate": 2.7542861368895444e-06, "loss": 0.004, "num_input_tokens_seen": 42346592, "step": 12730 }, { "epoch": 2.5480829352474803, "grad_norm": 0.7556296586990356, "learning_rate": 2.7423454038953245e-06, "loss": 0.0058, "num_input_tokens_seen": 42362496, "step": 12735 }, { "epoch": 2.549083360428182, "grad_norm": 0.1864507496356964, "learning_rate": 2.73042910846468e-06, "loss": 0.0029, "num_input_tokens_seen": 42380384, "step": 12740 }, { "epoch": 2.5500837856088836, "grad_norm": 1.1749720573425293, "learning_rate": 2.7185372636809815e-06, "loss": 0.0142, "num_input_tokens_seen": 42397760, "step": 12745 }, { "epoch": 2.5510842107895857, "grad_norm": 0.46301645040512085, "learning_rate": 2.7066698826007624e-06, "loss": 0.0028, "num_input_tokens_seen": 42415264, "step": 12750 }, { "epoch": 2.5520846359702873, "grad_norm": 0.0805196762084961, "learning_rate": 2.694826978253695e-06, "loss": 0.0126, "num_input_tokens_seen": 42432960, "step": 12755 }, { "epoch": 2.5530850611509894, "grad_norm": 0.6513873934745789, "learning_rate": 2.683008563642564e-06, "loss": 0.0032, "num_input_tokens_seen": 42450176, "step": 12760 }, { "epoch": 2.554085486331691, "grad_norm": 0.37311941385269165, "learning_rate": 2.671214651743284e-06, "loss": 0.0044, "num_input_tokens_seen": 42467072, "step": 12765 }, { "epoch": 2.5550859115123927, "grad_norm": 0.09965891391038895, "learning_rate": 2.6594452555048637e-06, "loss": 0.0026, "num_input_tokens_seen": 42484320, "step": 12770 }, { "epoch": 2.5560863366930944, "grad_norm": 0.617047905921936, "learning_rate": 2.6477003878493804e-06, "loss": 0.0014, "num_input_tokens_seen": 42502848, "step": 12775 }, { "epoch": 2.5570867618737965, "grad_norm": 0.0027189829852432013, "learning_rate": 2.6359800616719943e-06, "loss": 0.0067, "num_input_tokens_seen": 42518880, "step": 12780 }, { "epoch": 2.558087187054498, "grad_norm": 0.0006735940696671605, "learning_rate": 2.6242842898409255e-06, "loss": 0.0029, "num_input_tokens_seen": 42535424, "step": 12785 }, { "epoch": 2.5590876122352, "grad_norm": 0.5481417179107666, "learning_rate": 2.612613085197413e-06, "loss": 0.006, "num_input_tokens_seen": 42553088, "step": 12790 }, { "epoch": 2.560088037415902, "grad_norm": 0.0007917046896182001, "learning_rate": 2.6009664605557456e-06, "loss": 0.0054, "num_input_tokens_seen": 42570560, "step": 12795 }, { "epoch": 2.5610884625966035, "grad_norm": 0.02182622440159321, "learning_rate": 2.5893444287032137e-06, "loss": 0.0126, "num_input_tokens_seen": 42586688, "step": 12800 }, { "epoch": 2.562088887777305, "grad_norm": 0.7775011658668518, "learning_rate": 2.577747002400108e-06, "loss": 0.0046, "num_input_tokens_seen": 42603104, "step": 12805 }, { "epoch": 2.5630893129580072, "grad_norm": 0.17077545821666718, "learning_rate": 2.5661741943797057e-06, "loss": 0.0011, "num_input_tokens_seen": 42621600, "step": 12810 }, { "epoch": 2.564089738138709, "grad_norm": 0.03099755011498928, "learning_rate": 2.5546260173482465e-06, "loss": 0.0018, "num_input_tokens_seen": 42637536, "step": 12815 }, { "epoch": 2.565090163319411, "grad_norm": 2.6456117630004883, "learning_rate": 2.543102483984938e-06, "loss": 0.0041, "num_input_tokens_seen": 42654976, "step": 12820 }, { "epoch": 2.5660905885001126, "grad_norm": 0.040536269545555115, "learning_rate": 2.5316036069419292e-06, "loss": 0.0003, "num_input_tokens_seen": 42671392, "step": 12825 }, { "epoch": 2.5670910136808143, "grad_norm": 0.10133875161409378, "learning_rate": 2.5201293988442843e-06, "loss": 0.0058, "num_input_tokens_seen": 42687808, "step": 12830 }, { "epoch": 2.568091438861516, "grad_norm": 0.0018810753244906664, "learning_rate": 2.5086798722899958e-06, "loss": 0.0008, "num_input_tokens_seen": 42704064, "step": 12835 }, { "epoch": 2.569091864042218, "grad_norm": 0.1539742797613144, "learning_rate": 2.4972550398499606e-06, "loss": 0.0056, "num_input_tokens_seen": 42720576, "step": 12840 }, { "epoch": 2.5700922892229197, "grad_norm": 0.031564582139253616, "learning_rate": 2.485854914067942e-06, "loss": 0.0004, "num_input_tokens_seen": 42737056, "step": 12845 }, { "epoch": 2.5710927144036217, "grad_norm": 1.6116400957107544, "learning_rate": 2.4744795074606115e-06, "loss": 0.0086, "num_input_tokens_seen": 42753056, "step": 12850 }, { "epoch": 2.5720931395843234, "grad_norm": 0.17667633295059204, "learning_rate": 2.463128832517461e-06, "loss": 0.0091, "num_input_tokens_seen": 42770048, "step": 12855 }, { "epoch": 2.573093564765025, "grad_norm": 0.14781901240348816, "learning_rate": 2.4518029017008586e-06, "loss": 0.0021, "num_input_tokens_seen": 42786112, "step": 12860 }, { "epoch": 2.5740939899457267, "grad_norm": 8.325371163664386e-05, "learning_rate": 2.440501727445993e-06, "loss": 0.0035, "num_input_tokens_seen": 42802208, "step": 12865 }, { "epoch": 2.575094415126429, "grad_norm": 0.9688087105751038, "learning_rate": 2.429225322160869e-06, "loss": 0.0046, "num_input_tokens_seen": 42819104, "step": 12870 }, { "epoch": 2.5760948403071304, "grad_norm": 0.05281443893909454, "learning_rate": 2.417973698226297e-06, "loss": 0.0009, "num_input_tokens_seen": 42835552, "step": 12875 }, { "epoch": 2.5770952654878325, "grad_norm": 0.11945304274559021, "learning_rate": 2.406746867995896e-06, "loss": 0.0106, "num_input_tokens_seen": 42852096, "step": 12880 }, { "epoch": 2.578095690668534, "grad_norm": 0.06983213126659393, "learning_rate": 2.3955448437960326e-06, "loss": 0.0129, "num_input_tokens_seen": 42870016, "step": 12885 }, { "epoch": 2.579096115849236, "grad_norm": 0.5124595761299133, "learning_rate": 2.3843676379258636e-06, "loss": 0.0078, "num_input_tokens_seen": 42887264, "step": 12890 }, { "epoch": 2.5800965410299375, "grad_norm": 0.016952697187662125, "learning_rate": 2.373215262657286e-06, "loss": 0.0103, "num_input_tokens_seen": 42903776, "step": 12895 }, { "epoch": 2.5810969662106396, "grad_norm": 1.0047266483306885, "learning_rate": 2.3620877302349296e-06, "loss": 0.0068, "num_input_tokens_seen": 42919936, "step": 12900 }, { "epoch": 2.582097391391341, "grad_norm": 0.0015937060816213489, "learning_rate": 2.3509850528761675e-06, "loss": 0.0071, "num_input_tokens_seen": 42936864, "step": 12905 }, { "epoch": 2.5830978165720433, "grad_norm": 1.1491022109985352, "learning_rate": 2.339907242771061e-06, "loss": 0.0114, "num_input_tokens_seen": 42954976, "step": 12910 }, { "epoch": 2.584098241752745, "grad_norm": 0.05709850415587425, "learning_rate": 2.328854312082376e-06, "loss": 0.0013, "num_input_tokens_seen": 42971680, "step": 12915 }, { "epoch": 2.5850986669334466, "grad_norm": 1.429154872894287, "learning_rate": 2.317826272945578e-06, "loss": 0.0059, "num_input_tokens_seen": 42987712, "step": 12920 }, { "epoch": 2.5860990921141487, "grad_norm": 0.738528847694397, "learning_rate": 2.3068231374687722e-06, "loss": 0.0018, "num_input_tokens_seen": 43003904, "step": 12925 }, { "epoch": 2.5870995172948503, "grad_norm": 0.006645476911216974, "learning_rate": 2.2958449177327478e-06, "loss": 0.008, "num_input_tokens_seen": 43020160, "step": 12930 }, { "epoch": 2.588099942475552, "grad_norm": 1.5658634901046753, "learning_rate": 2.2848916257909305e-06, "loss": 0.0061, "num_input_tokens_seen": 43037472, "step": 12935 }, { "epoch": 2.589100367656254, "grad_norm": 0.013548973947763443, "learning_rate": 2.2739632736693663e-06, "loss": 0.0008, "num_input_tokens_seen": 43053920, "step": 12940 }, { "epoch": 2.5901007928369557, "grad_norm": 0.029908064752817154, "learning_rate": 2.263059873366735e-06, "loss": 0.0023, "num_input_tokens_seen": 43071552, "step": 12945 }, { "epoch": 2.5911012180176574, "grad_norm": 0.03885706141591072, "learning_rate": 2.2521814368543088e-06, "loss": 0.0013, "num_input_tokens_seen": 43088096, "step": 12950 }, { "epoch": 2.5921016431983595, "grad_norm": 0.28232115507125854, "learning_rate": 2.2413279760759576e-06, "loss": 0.0052, "num_input_tokens_seen": 43105568, "step": 12955 }, { "epoch": 2.593102068379061, "grad_norm": 0.15103229880332947, "learning_rate": 2.2304995029481303e-06, "loss": 0.0008, "num_input_tokens_seen": 43121792, "step": 12960 }, { "epoch": 2.5941024935597627, "grad_norm": 0.05918802320957184, "learning_rate": 2.21969602935983e-06, "loss": 0.0019, "num_input_tokens_seen": 43139040, "step": 12965 }, { "epoch": 2.595102918740465, "grad_norm": 0.05563802644610405, "learning_rate": 2.208917567172625e-06, "loss": 0.0046, "num_input_tokens_seen": 43155264, "step": 12970 }, { "epoch": 2.5961033439211665, "grad_norm": 0.25392958521842957, "learning_rate": 2.19816412822062e-06, "loss": 0.0077, "num_input_tokens_seen": 43171200, "step": 12975 }, { "epoch": 2.597103769101868, "grad_norm": 0.006377012934535742, "learning_rate": 2.1874357243104355e-06, "loss": 0.0104, "num_input_tokens_seen": 43188704, "step": 12980 }, { "epoch": 2.5981041942825702, "grad_norm": 0.21373534202575684, "learning_rate": 2.176732367221218e-06, "loss": 0.0067, "num_input_tokens_seen": 43205248, "step": 12985 }, { "epoch": 2.599104619463272, "grad_norm": 0.003723123110830784, "learning_rate": 2.166054068704609e-06, "loss": 0.0005, "num_input_tokens_seen": 43221152, "step": 12990 }, { "epoch": 2.600105044643974, "grad_norm": 0.2379627823829651, "learning_rate": 2.1554008404847335e-06, "loss": 0.0031, "num_input_tokens_seen": 43237824, "step": 12995 }, { "epoch": 2.6011054698246756, "grad_norm": 0.0022142406087368727, "learning_rate": 2.144772694258193e-06, "loss": 0.0067, "num_input_tokens_seen": 43254592, "step": 13000 }, { "epoch": 2.6021058950053773, "grad_norm": 1.8083733320236206, "learning_rate": 2.134169641694056e-06, "loss": 0.0087, "num_input_tokens_seen": 43270432, "step": 13005 }, { "epoch": 2.603106320186079, "grad_norm": 0.0026089141611009836, "learning_rate": 2.1235916944338325e-06, "loss": 0.0022, "num_input_tokens_seen": 43288096, "step": 13010 }, { "epoch": 2.604106745366781, "grad_norm": 0.4001295864582062, "learning_rate": 2.1130388640914794e-06, "loss": 0.0065, "num_input_tokens_seen": 43303392, "step": 13015 }, { "epoch": 2.6051071705474826, "grad_norm": 0.2899264991283417, "learning_rate": 2.1025111622533537e-06, "loss": 0.0043, "num_input_tokens_seen": 43321248, "step": 13020 }, { "epoch": 2.6061075957281847, "grad_norm": 1.9828625917434692, "learning_rate": 2.0920086004782487e-06, "loss": 0.0067, "num_input_tokens_seen": 43337472, "step": 13025 }, { "epoch": 2.6071080209088864, "grad_norm": 0.3573082983493805, "learning_rate": 2.081531190297342e-06, "loss": 0.0016, "num_input_tokens_seen": 43353120, "step": 13030 }, { "epoch": 2.608108446089588, "grad_norm": 0.07330165803432465, "learning_rate": 2.0710789432141968e-06, "loss": 0.0006, "num_input_tokens_seen": 43368832, "step": 13035 }, { "epoch": 2.6091088712702897, "grad_norm": 0.6771386861801147, "learning_rate": 2.0606518707047493e-06, "loss": 0.0036, "num_input_tokens_seen": 43385856, "step": 13040 }, { "epoch": 2.6101092964509918, "grad_norm": 0.08871681988239288, "learning_rate": 2.0502499842173033e-06, "loss": 0.0059, "num_input_tokens_seen": 43401888, "step": 13045 }, { "epoch": 2.6111097216316934, "grad_norm": 0.0013250740012153983, "learning_rate": 2.0398732951724952e-06, "loss": 0.0043, "num_input_tokens_seen": 43417664, "step": 13050 }, { "epoch": 2.6121101468123955, "grad_norm": 0.0141685726121068, "learning_rate": 2.0295218149633034e-06, "loss": 0.0022, "num_input_tokens_seen": 43434624, "step": 13055 }, { "epoch": 2.613110571993097, "grad_norm": 0.004939412698149681, "learning_rate": 2.0191955549550447e-06, "loss": 0.0046, "num_input_tokens_seen": 43450432, "step": 13060 }, { "epoch": 2.614110997173799, "grad_norm": 0.20622195303440094, "learning_rate": 2.0088945264853137e-06, "loss": 0.001, "num_input_tokens_seen": 43467456, "step": 13065 }, { "epoch": 2.6151114223545004, "grad_norm": 0.7119448184967041, "learning_rate": 1.9986187408640304e-06, "loss": 0.0019, "num_input_tokens_seen": 43484384, "step": 13070 }, { "epoch": 2.6161118475352025, "grad_norm": 0.006884606555104256, "learning_rate": 1.9883682093733786e-06, "loss": 0.0025, "num_input_tokens_seen": 43500160, "step": 13075 }, { "epoch": 2.617112272715904, "grad_norm": 0.4917401373386383, "learning_rate": 1.9781429432678274e-06, "loss": 0.0014, "num_input_tokens_seen": 43517472, "step": 13080 }, { "epoch": 2.6181126978966063, "grad_norm": 0.22076082229614258, "learning_rate": 1.9679429537741103e-06, "loss": 0.0136, "num_input_tokens_seen": 43533408, "step": 13085 }, { "epoch": 2.619113123077308, "grad_norm": 1.2674239873886108, "learning_rate": 1.957768252091191e-06, "loss": 0.0056, "num_input_tokens_seen": 43550752, "step": 13090 }, { "epoch": 2.6201135482580096, "grad_norm": 0.01584111526608467, "learning_rate": 1.9476188493902813e-06, "loss": 0.0058, "num_input_tokens_seen": 43567296, "step": 13095 }, { "epoch": 2.621113973438711, "grad_norm": 0.2965255081653595, "learning_rate": 1.93749475681482e-06, "loss": 0.001, "num_input_tokens_seen": 43583232, "step": 13100 }, { "epoch": 2.6221143986194133, "grad_norm": 0.2845730483531952, "learning_rate": 1.9273959854804347e-06, "loss": 0.0045, "num_input_tokens_seen": 43599840, "step": 13105 }, { "epoch": 2.623114823800115, "grad_norm": 0.8512167930603027, "learning_rate": 1.9173225464749865e-06, "loss": 0.0076, "num_input_tokens_seen": 43616352, "step": 13110 }, { "epoch": 2.624115248980817, "grad_norm": 1.9393882751464844, "learning_rate": 1.9072744508585011e-06, "loss": 0.0153, "num_input_tokens_seen": 43632480, "step": 13115 }, { "epoch": 2.6251156741615187, "grad_norm": 0.04237949475646019, "learning_rate": 1.8972517096631725e-06, "loss": 0.0026, "num_input_tokens_seen": 43649088, "step": 13120 }, { "epoch": 2.6261160993422203, "grad_norm": 0.04926257207989693, "learning_rate": 1.8872543338933762e-06, "loss": 0.0015, "num_input_tokens_seen": 43666944, "step": 13125 }, { "epoch": 2.627116524522922, "grad_norm": 0.2511679232120514, "learning_rate": 1.8772823345256207e-06, "loss": 0.0008, "num_input_tokens_seen": 43683136, "step": 13130 }, { "epoch": 2.628116949703624, "grad_norm": 1.242906093597412, "learning_rate": 1.8673357225085646e-06, "loss": 0.0056, "num_input_tokens_seen": 43701184, "step": 13135 }, { "epoch": 2.6291173748843257, "grad_norm": 0.09425300359725952, "learning_rate": 1.857414508762992e-06, "loss": 0.0056, "num_input_tokens_seen": 43717408, "step": 13140 }, { "epoch": 2.630117800065028, "grad_norm": 0.008755037561058998, "learning_rate": 1.8475187041817887e-06, "loss": 0.0056, "num_input_tokens_seen": 43733312, "step": 13145 }, { "epoch": 2.6311182252457295, "grad_norm": 2.027099847793579, "learning_rate": 1.837648319629956e-06, "loss": 0.005, "num_input_tokens_seen": 43750496, "step": 13150 }, { "epoch": 2.632118650426431, "grad_norm": 1.6400530338287354, "learning_rate": 1.8278033659445831e-06, "loss": 0.0052, "num_input_tokens_seen": 43767776, "step": 13155 }, { "epoch": 2.6331190756071328, "grad_norm": 0.012904511764645576, "learning_rate": 1.8179838539348376e-06, "loss": 0.0019, "num_input_tokens_seen": 43785440, "step": 13160 }, { "epoch": 2.634119500787835, "grad_norm": 0.36115771532058716, "learning_rate": 1.8081897943819453e-06, "loss": 0.0011, "num_input_tokens_seen": 43801856, "step": 13165 }, { "epoch": 2.6351199259685365, "grad_norm": 0.31264355778694153, "learning_rate": 1.7984211980392045e-06, "loss": 0.0013, "num_input_tokens_seen": 43817696, "step": 13170 }, { "epoch": 2.6361203511492386, "grad_norm": 0.008913216181099415, "learning_rate": 1.7886780756319333e-06, "loss": 0.0005, "num_input_tokens_seen": 43835584, "step": 13175 }, { "epoch": 2.6371207763299402, "grad_norm": 0.05168372765183449, "learning_rate": 1.7789604378574965e-06, "loss": 0.0038, "num_input_tokens_seen": 43852000, "step": 13180 }, { "epoch": 2.638121201510642, "grad_norm": 0.008873915299773216, "learning_rate": 1.7692682953852819e-06, "loss": 0.0029, "num_input_tokens_seen": 43868608, "step": 13185 }, { "epoch": 2.639121626691344, "grad_norm": 1.931552767753601, "learning_rate": 1.759601658856669e-06, "loss": 0.0046, "num_input_tokens_seen": 43885120, "step": 13190 }, { "epoch": 2.6401220518720456, "grad_norm": 0.883804440498352, "learning_rate": 1.7499605388850482e-06, "loss": 0.0021, "num_input_tokens_seen": 43901280, "step": 13195 }, { "epoch": 2.6411224770527473, "grad_norm": 0.09361104667186737, "learning_rate": 1.740344946055783e-06, "loss": 0.0031, "num_input_tokens_seen": 43916736, "step": 13200 }, { "epoch": 2.6421229022334494, "grad_norm": 0.7549806237220764, "learning_rate": 1.7307548909262117e-06, "loss": 0.0025, "num_input_tokens_seen": 43934464, "step": 13205 }, { "epoch": 2.643123327414151, "grad_norm": 0.11326675862073898, "learning_rate": 1.7211903840256532e-06, "loss": 0.0027, "num_input_tokens_seen": 43950752, "step": 13210 }, { "epoch": 2.6441237525948527, "grad_norm": 0.11757057160139084, "learning_rate": 1.7116514358553438e-06, "loss": 0.0025, "num_input_tokens_seen": 43967776, "step": 13215 }, { "epoch": 2.6451241777755548, "grad_norm": 0.6157832741737366, "learning_rate": 1.7021380568884803e-06, "loss": 0.0019, "num_input_tokens_seen": 43985184, "step": 13220 }, { "epoch": 2.6461246029562564, "grad_norm": 0.09997295588254929, "learning_rate": 1.692650257570183e-06, "loss": 0.0073, "num_input_tokens_seen": 44002176, "step": 13225 }, { "epoch": 2.647125028136958, "grad_norm": 0.20389370620250702, "learning_rate": 1.6831880483174771e-06, "loss": 0.0005, "num_input_tokens_seen": 44019776, "step": 13230 }, { "epoch": 2.64812545331766, "grad_norm": 0.6792962551116943, "learning_rate": 1.6737514395192972e-06, "loss": 0.0011, "num_input_tokens_seen": 44036736, "step": 13235 }, { "epoch": 2.649125878498362, "grad_norm": 0.35949239134788513, "learning_rate": 1.66434044153648e-06, "loss": 0.0109, "num_input_tokens_seen": 44053472, "step": 13240 }, { "epoch": 2.6501263036790634, "grad_norm": 0.00013263363507576287, "learning_rate": 1.654955064701727e-06, "loss": 0.0023, "num_input_tokens_seen": 44070016, "step": 13245 }, { "epoch": 2.6511267288597655, "grad_norm": 0.24277490377426147, "learning_rate": 1.6455953193196205e-06, "loss": 0.0049, "num_input_tokens_seen": 44087328, "step": 13250 }, { "epoch": 2.652127154040467, "grad_norm": 0.12013000249862671, "learning_rate": 1.6362612156665903e-06, "loss": 0.0093, "num_input_tokens_seen": 44104768, "step": 13255 }, { "epoch": 2.6531275792211693, "grad_norm": 0.14752374589443207, "learning_rate": 1.6269527639909215e-06, "loss": 0.0018, "num_input_tokens_seen": 44120608, "step": 13260 }, { "epoch": 2.654128004401871, "grad_norm": 0.9413996934890747, "learning_rate": 1.6176699745127422e-06, "loss": 0.0076, "num_input_tokens_seen": 44136992, "step": 13265 }, { "epoch": 2.6551284295825726, "grad_norm": 0.035456474870443344, "learning_rate": 1.6084128574239881e-06, "loss": 0.0051, "num_input_tokens_seen": 44153280, "step": 13270 }, { "epoch": 2.656128854763274, "grad_norm": 0.007137295324355364, "learning_rate": 1.5991814228884156e-06, "loss": 0.0002, "num_input_tokens_seen": 44169376, "step": 13275 }, { "epoch": 2.6571292799439763, "grad_norm": 0.014517224393785, "learning_rate": 1.589975681041589e-06, "loss": 0.0019, "num_input_tokens_seen": 44185472, "step": 13280 }, { "epoch": 2.658129705124678, "grad_norm": 1.2557194232940674, "learning_rate": 1.580795641990851e-06, "loss": 0.0036, "num_input_tokens_seen": 44202816, "step": 13285 }, { "epoch": 2.65913013030538, "grad_norm": 0.022609582170844078, "learning_rate": 1.5716413158153338e-06, "loss": 0.0071, "num_input_tokens_seen": 44219648, "step": 13290 }, { "epoch": 2.6601305554860817, "grad_norm": 0.11861256510019302, "learning_rate": 1.5625127125659445e-06, "loss": 0.0008, "num_input_tokens_seen": 44235232, "step": 13295 }, { "epoch": 2.6611309806667833, "grad_norm": 0.17032299935817719, "learning_rate": 1.5534098422653243e-06, "loss": 0.0056, "num_input_tokens_seen": 44251552, "step": 13300 }, { "epoch": 2.662131405847485, "grad_norm": 0.22129566967487335, "learning_rate": 1.544332714907884e-06, "loss": 0.0011, "num_input_tokens_seen": 44268704, "step": 13305 }, { "epoch": 2.663131831028187, "grad_norm": 0.0006550539401359856, "learning_rate": 1.5352813404597655e-06, "loss": 0.0054, "num_input_tokens_seen": 44284768, "step": 13310 }, { "epoch": 2.6641322562088887, "grad_norm": 0.47652092576026917, "learning_rate": 1.5262557288588275e-06, "loss": 0.0075, "num_input_tokens_seen": 44301184, "step": 13315 }, { "epoch": 2.665132681389591, "grad_norm": 0.6773017048835754, "learning_rate": 1.5172558900146517e-06, "loss": 0.0025, "num_input_tokens_seen": 44318208, "step": 13320 }, { "epoch": 2.6661331065702925, "grad_norm": 0.3130875527858734, "learning_rate": 1.5082818338085142e-06, "loss": 0.0014, "num_input_tokens_seen": 44335872, "step": 13325 }, { "epoch": 2.667133531750994, "grad_norm": 1.1312685012817383, "learning_rate": 1.499333570093392e-06, "loss": 0.0026, "num_input_tokens_seen": 44353184, "step": 13330 }, { "epoch": 2.6681339569316957, "grad_norm": 0.06901410967111588, "learning_rate": 1.4904111086939398e-06, "loss": 0.0021, "num_input_tokens_seen": 44370592, "step": 13335 }, { "epoch": 2.669134382112398, "grad_norm": 0.028959838673472404, "learning_rate": 1.4815144594064773e-06, "loss": 0.0052, "num_input_tokens_seen": 44387680, "step": 13340 }, { "epoch": 2.6701348072930995, "grad_norm": 0.4723975956439972, "learning_rate": 1.4726436319989933e-06, "loss": 0.0035, "num_input_tokens_seen": 44403680, "step": 13345 }, { "epoch": 2.6711352324738016, "grad_norm": 0.8988831043243408, "learning_rate": 1.463798636211125e-06, "loss": 0.0043, "num_input_tokens_seen": 44419552, "step": 13350 }, { "epoch": 2.6721356576545032, "grad_norm": 0.9595151543617249, "learning_rate": 1.4549794817541374e-06, "loss": 0.0058, "num_input_tokens_seen": 44435808, "step": 13355 }, { "epoch": 2.673136082835205, "grad_norm": 0.26894503831863403, "learning_rate": 1.446186178310932e-06, "loss": 0.0011, "num_input_tokens_seen": 44452864, "step": 13360 }, { "epoch": 2.6741365080159065, "grad_norm": 0.024321796372532845, "learning_rate": 1.4374187355360307e-06, "loss": 0.0013, "num_input_tokens_seen": 44469280, "step": 13365 }, { "epoch": 2.6751369331966086, "grad_norm": 0.059120357036590576, "learning_rate": 1.4286771630555557e-06, "loss": 0.002, "num_input_tokens_seen": 44486816, "step": 13370 }, { "epoch": 2.6761373583773103, "grad_norm": 0.024076877161860466, "learning_rate": 1.4199614704672293e-06, "loss": 0.0004, "num_input_tokens_seen": 44503808, "step": 13375 }, { "epoch": 2.6771377835580124, "grad_norm": 0.00032420014031231403, "learning_rate": 1.4112716673403497e-06, "loss": 0.0042, "num_input_tokens_seen": 44520416, "step": 13380 }, { "epoch": 2.678138208738714, "grad_norm": 0.47371798753738403, "learning_rate": 1.402607763215802e-06, "loss": 0.0047, "num_input_tokens_seen": 44537248, "step": 13385 }, { "epoch": 2.6791386339194156, "grad_norm": 0.649607241153717, "learning_rate": 1.393969767606032e-06, "loss": 0.0081, "num_input_tokens_seen": 44553600, "step": 13390 }, { "epoch": 2.6801390591001173, "grad_norm": 0.12219960242509842, "learning_rate": 1.3853576899950344e-06, "loss": 0.0013, "num_input_tokens_seen": 44571520, "step": 13395 }, { "epoch": 2.6811394842808194, "grad_norm": 0.9294015169143677, "learning_rate": 1.3767715398383507e-06, "loss": 0.0097, "num_input_tokens_seen": 44588000, "step": 13400 }, { "epoch": 2.682139909461521, "grad_norm": 0.6352314352989197, "learning_rate": 1.3682113265630652e-06, "loss": 0.0091, "num_input_tokens_seen": 44604960, "step": 13405 }, { "epoch": 2.683140334642223, "grad_norm": 0.12153366208076477, "learning_rate": 1.3596770595677655e-06, "loss": 0.0027, "num_input_tokens_seen": 44620448, "step": 13410 }, { "epoch": 2.6841407598229248, "grad_norm": 0.23107106983661652, "learning_rate": 1.35116874822257e-06, "loss": 0.0008, "num_input_tokens_seen": 44638272, "step": 13415 }, { "epoch": 2.6851411850036264, "grad_norm": 0.07143131643533707, "learning_rate": 1.3426864018690865e-06, "loss": 0.0028, "num_input_tokens_seen": 44655136, "step": 13420 }, { "epoch": 2.686141610184328, "grad_norm": 1.5513947010040283, "learning_rate": 1.334230029820424e-06, "loss": 0.0045, "num_input_tokens_seen": 44671808, "step": 13425 }, { "epoch": 2.68714203536503, "grad_norm": 0.32228389382362366, "learning_rate": 1.3257996413611695e-06, "loss": 0.0101, "num_input_tokens_seen": 44688448, "step": 13430 }, { "epoch": 2.688142460545732, "grad_norm": 0.09518205374479294, "learning_rate": 1.3173952457473798e-06, "loss": 0.0008, "num_input_tokens_seen": 44704736, "step": 13435 }, { "epoch": 2.689142885726434, "grad_norm": 0.04356449842453003, "learning_rate": 1.3090168522065738e-06, "loss": 0.0016, "num_input_tokens_seen": 44722400, "step": 13440 }, { "epoch": 2.6901433109071355, "grad_norm": 0.25289589166641235, "learning_rate": 1.300664469937729e-06, "loss": 0.0033, "num_input_tokens_seen": 44740000, "step": 13445 }, { "epoch": 2.691143736087837, "grad_norm": 1.0440763235092163, "learning_rate": 1.292338108111249e-06, "loss": 0.0083, "num_input_tokens_seen": 44757248, "step": 13450 }, { "epoch": 2.6921441612685393, "grad_norm": 3.475522041320801, "learning_rate": 1.284037775868982e-06, "loss": 0.0082, "num_input_tokens_seen": 44774400, "step": 13455 }, { "epoch": 2.693144586449241, "grad_norm": 0.00039189582457765937, "learning_rate": 1.2757634823241938e-06, "loss": 0.0015, "num_input_tokens_seen": 44790560, "step": 13460 }, { "epoch": 2.6941450116299426, "grad_norm": 0.11311087757349014, "learning_rate": 1.2675152365615533e-06, "loss": 0.0007, "num_input_tokens_seen": 44808032, "step": 13465 }, { "epoch": 2.6951454368106447, "grad_norm": 1.334816813468933, "learning_rate": 1.25929304763715e-06, "loss": 0.0029, "num_input_tokens_seen": 44824448, "step": 13470 }, { "epoch": 2.6961458619913463, "grad_norm": 0.06325584650039673, "learning_rate": 1.2510969245784376e-06, "loss": 0.0071, "num_input_tokens_seen": 44841440, "step": 13475 }, { "epoch": 2.697146287172048, "grad_norm": 0.4612632691860199, "learning_rate": 1.242926876384276e-06, "loss": 0.0054, "num_input_tokens_seen": 44857696, "step": 13480 }, { "epoch": 2.69814671235275, "grad_norm": 0.0031939514447003603, "learning_rate": 1.2347829120248844e-06, "loss": 0.0058, "num_input_tokens_seen": 44874656, "step": 13485 }, { "epoch": 2.6991471375334517, "grad_norm": 0.07405953854322433, "learning_rate": 1.2266650404418379e-06, "loss": 0.0053, "num_input_tokens_seen": 44891328, "step": 13490 }, { "epoch": 2.700147562714154, "grad_norm": 0.0022581659723073244, "learning_rate": 1.2185732705480762e-06, "loss": 0.0041, "num_input_tokens_seen": 44908160, "step": 13495 }, { "epoch": 2.7011479878948554, "grad_norm": 0.7377486824989319, "learning_rate": 1.2105076112278763e-06, "loss": 0.0101, "num_input_tokens_seen": 44924672, "step": 13500 }, { "epoch": 2.702148413075557, "grad_norm": 0.6490694284439087, "learning_rate": 1.202468071336843e-06, "loss": 0.0021, "num_input_tokens_seen": 44940608, "step": 13505 }, { "epoch": 2.7031488382562587, "grad_norm": 0.9131351709365845, "learning_rate": 1.1944546597019046e-06, "loss": 0.0032, "num_input_tokens_seen": 44956704, "step": 13510 }, { "epoch": 2.704149263436961, "grad_norm": 1.0523442029953003, "learning_rate": 1.1864673851213065e-06, "loss": 0.0029, "num_input_tokens_seen": 44973280, "step": 13515 }, { "epoch": 2.7051496886176625, "grad_norm": 0.2591552734375, "learning_rate": 1.1785062563645976e-06, "loss": 0.0047, "num_input_tokens_seen": 44989984, "step": 13520 }, { "epoch": 2.7061501137983646, "grad_norm": 1.9421402215957642, "learning_rate": 1.1705712821726195e-06, "loss": 0.0084, "num_input_tokens_seen": 45005920, "step": 13525 }, { "epoch": 2.707150538979066, "grad_norm": 0.03383360430598259, "learning_rate": 1.1626624712574862e-06, "loss": 0.001, "num_input_tokens_seen": 45022080, "step": 13530 }, { "epoch": 2.708150964159768, "grad_norm": 0.04191024228930473, "learning_rate": 1.1547798323025994e-06, "loss": 0.0063, "num_input_tokens_seen": 45038624, "step": 13535 }, { "epoch": 2.7091513893404695, "grad_norm": 0.12199781090021133, "learning_rate": 1.14692337396263e-06, "loss": 0.0056, "num_input_tokens_seen": 45055072, "step": 13540 }, { "epoch": 2.7101518145211716, "grad_norm": 0.04170893877744675, "learning_rate": 1.139093104863484e-06, "loss": 0.0006, "num_input_tokens_seen": 45072096, "step": 13545 }, { "epoch": 2.7111522397018732, "grad_norm": 0.03867083787918091, "learning_rate": 1.1312890336023285e-06, "loss": 0.0047, "num_input_tokens_seen": 45087264, "step": 13550 }, { "epoch": 2.7121526648825753, "grad_norm": 0.011973058804869652, "learning_rate": 1.1235111687475707e-06, "loss": 0.009, "num_input_tokens_seen": 45103136, "step": 13555 }, { "epoch": 2.713153090063277, "grad_norm": 0.05495055764913559, "learning_rate": 1.115759518838827e-06, "loss": 0.0027, "num_input_tokens_seen": 45119872, "step": 13560 }, { "epoch": 2.7141535152439786, "grad_norm": 0.5237078666687012, "learning_rate": 1.1080340923869475e-06, "loss": 0.0054, "num_input_tokens_seen": 45136736, "step": 13565 }, { "epoch": 2.7151539404246803, "grad_norm": 1.1536104679107666, "learning_rate": 1.1003348978739836e-06, "loss": 0.002, "num_input_tokens_seen": 45153312, "step": 13570 }, { "epoch": 2.7161543656053824, "grad_norm": 1.8903820514678955, "learning_rate": 1.0926619437531898e-06, "loss": 0.0069, "num_input_tokens_seen": 45169632, "step": 13575 }, { "epoch": 2.717154790786084, "grad_norm": 0.9461204409599304, "learning_rate": 1.0850152384490081e-06, "loss": 0.0022, "num_input_tokens_seen": 45186368, "step": 13580 }, { "epoch": 2.718155215966786, "grad_norm": 0.07230707257986069, "learning_rate": 1.077394790357053e-06, "loss": 0.0072, "num_input_tokens_seen": 45202592, "step": 13585 }, { "epoch": 2.7191556411474878, "grad_norm": 0.8314633965492249, "learning_rate": 1.069800607844121e-06, "loss": 0.0016, "num_input_tokens_seen": 45220000, "step": 13590 }, { "epoch": 2.7201560663281894, "grad_norm": 0.5452732443809509, "learning_rate": 1.0622326992481695e-06, "loss": 0.0059, "num_input_tokens_seen": 45236960, "step": 13595 }, { "epoch": 2.721156491508891, "grad_norm": 0.2344525009393692, "learning_rate": 1.0546910728783027e-06, "loss": 0.0015, "num_input_tokens_seen": 45255232, "step": 13600 }, { "epoch": 2.722156916689593, "grad_norm": 1.2953059673309326, "learning_rate": 1.0471757370147745e-06, "loss": 0.0193, "num_input_tokens_seen": 45272576, "step": 13605 }, { "epoch": 2.723157341870295, "grad_norm": 0.07832599431276321, "learning_rate": 1.0396866999089704e-06, "loss": 0.0022, "num_input_tokens_seen": 45288352, "step": 13610 }, { "epoch": 2.724157767050997, "grad_norm": 0.4092762768268585, "learning_rate": 1.0322239697833997e-06, "loss": 0.0058, "num_input_tokens_seen": 45305280, "step": 13615 }, { "epoch": 2.7251581922316985, "grad_norm": 0.7260483503341675, "learning_rate": 1.024787554831691e-06, "loss": 0.0096, "num_input_tokens_seen": 45320320, "step": 13620 }, { "epoch": 2.7261586174124, "grad_norm": 0.014826206490397453, "learning_rate": 1.0173774632185801e-06, "loss": 0.0021, "num_input_tokens_seen": 45337760, "step": 13625 }, { "epoch": 2.727159042593102, "grad_norm": 0.0008843060932122171, "learning_rate": 1.0099937030799034e-06, "loss": 0.004, "num_input_tokens_seen": 45355744, "step": 13630 }, { "epoch": 2.728159467773804, "grad_norm": 0.1502399891614914, "learning_rate": 1.0026362825225844e-06, "loss": 0.0018, "num_input_tokens_seen": 45371488, "step": 13635 }, { "epoch": 2.7291598929545056, "grad_norm": 0.02906009927392006, "learning_rate": 9.953052096246224e-07, "loss": 0.0056, "num_input_tokens_seen": 45387584, "step": 13640 }, { "epoch": 2.7301603181352077, "grad_norm": 0.042441483587026596, "learning_rate": 9.880004924350984e-07, "loss": 0.0013, "num_input_tokens_seen": 45403968, "step": 13645 }, { "epoch": 2.7311607433159093, "grad_norm": 0.10809578746557236, "learning_rate": 9.80722138974155e-07, "loss": 0.0058, "num_input_tokens_seen": 45420320, "step": 13650 }, { "epoch": 2.732161168496611, "grad_norm": 0.007705936208367348, "learning_rate": 9.734701572329753e-07, "loss": 0.0015, "num_input_tokens_seen": 45436672, "step": 13655 }, { "epoch": 2.7331615936773126, "grad_norm": 0.012891904450953007, "learning_rate": 9.662445551738064e-07, "loss": 0.0065, "num_input_tokens_seen": 45453440, "step": 13660 }, { "epoch": 2.7341620188580147, "grad_norm": 0.10762783885002136, "learning_rate": 9.59045340729925e-07, "loss": 0.0055, "num_input_tokens_seen": 45469312, "step": 13665 }, { "epoch": 2.7351624440387163, "grad_norm": 1.0634167194366455, "learning_rate": 9.518725218056246e-07, "loss": 0.0036, "num_input_tokens_seen": 45484416, "step": 13670 }, { "epoch": 2.7361628692194184, "grad_norm": 1.4050567150115967, "learning_rate": 9.447261062762419e-07, "loss": 0.0052, "num_input_tokens_seen": 45500544, "step": 13675 }, { "epoch": 2.73716329440012, "grad_norm": 1.091619849205017, "learning_rate": 9.376061019881005e-07, "loss": 0.0081, "num_input_tokens_seen": 45517664, "step": 13680 }, { "epoch": 2.7381637195808217, "grad_norm": 0.01584814302623272, "learning_rate": 9.305125167585388e-07, "loss": 0.0036, "num_input_tokens_seen": 45535296, "step": 13685 }, { "epoch": 2.739164144761524, "grad_norm": 0.1886490136384964, "learning_rate": 9.234453583758906e-07, "loss": 0.001, "num_input_tokens_seen": 45552128, "step": 13690 }, { "epoch": 2.7401645699422255, "grad_norm": 0.009118461050093174, "learning_rate": 9.164046345994604e-07, "loss": 0.0085, "num_input_tokens_seen": 45568352, "step": 13695 }, { "epoch": 2.741164995122927, "grad_norm": 0.0091153709217906, "learning_rate": 9.093903531595476e-07, "loss": 0.0023, "num_input_tokens_seen": 45584512, "step": 13700 }, { "epoch": 2.742165420303629, "grad_norm": 0.3516223132610321, "learning_rate": 9.024025217574089e-07, "loss": 0.0065, "num_input_tokens_seen": 45600352, "step": 13705 }, { "epoch": 2.743165845484331, "grad_norm": 1.48143470287323, "learning_rate": 8.954411480652569e-07, "loss": 0.0201, "num_input_tokens_seen": 45616160, "step": 13710 }, { "epoch": 2.7441662706650325, "grad_norm": 0.9060990810394287, "learning_rate": 8.885062397262639e-07, "loss": 0.0026, "num_input_tokens_seen": 45632736, "step": 13715 }, { "epoch": 2.7451666958457346, "grad_norm": 0.10566138476133347, "learning_rate": 8.815978043545392e-07, "loss": 0.0035, "num_input_tokens_seen": 45649696, "step": 13720 }, { "epoch": 2.7461671210264362, "grad_norm": 0.3342597186565399, "learning_rate": 8.747158495351348e-07, "loss": 0.0007, "num_input_tokens_seen": 45665856, "step": 13725 }, { "epoch": 2.747167546207138, "grad_norm": 0.643731415271759, "learning_rate": 8.678603828240178e-07, "loss": 0.0034, "num_input_tokens_seen": 45682464, "step": 13730 }, { "epoch": 2.74816797138784, "grad_norm": 1.8434544801712036, "learning_rate": 8.610314117480783e-07, "loss": 0.0098, "num_input_tokens_seen": 45699456, "step": 13735 }, { "epoch": 2.7491683965685416, "grad_norm": 3.394533634185791, "learning_rate": 8.542289438051132e-07, "loss": 0.0203, "num_input_tokens_seen": 45716512, "step": 13740 }, { "epoch": 2.7501688217492433, "grad_norm": 0.34342724084854126, "learning_rate": 8.474529864638259e-07, "loss": 0.0071, "num_input_tokens_seen": 45733312, "step": 13745 }, { "epoch": 2.7511692469299454, "grad_norm": 0.0001394088176311925, "learning_rate": 8.40703547163807e-07, "loss": 0.0039, "num_input_tokens_seen": 45749280, "step": 13750 }, { "epoch": 2.752169672110647, "grad_norm": 0.177780881524086, "learning_rate": 8.339806333155342e-07, "loss": 0.0046, "num_input_tokens_seen": 45766208, "step": 13755 }, { "epoch": 2.753170097291349, "grad_norm": 0.6227426528930664, "learning_rate": 8.272842523003643e-07, "loss": 0.0019, "num_input_tokens_seen": 45781952, "step": 13760 }, { "epoch": 2.7541705224720507, "grad_norm": 0.005509722046554089, "learning_rate": 8.206144114705134e-07, "loss": 0.0038, "num_input_tokens_seen": 45798560, "step": 13765 }, { "epoch": 2.7551709476527524, "grad_norm": 0.0030598777811974287, "learning_rate": 8.139711181490706e-07, "loss": 0.0003, "num_input_tokens_seen": 45814656, "step": 13770 }, { "epoch": 2.756171372833454, "grad_norm": 1.6217525005340576, "learning_rate": 8.07354379629971e-07, "loss": 0.0151, "num_input_tokens_seen": 45832032, "step": 13775 }, { "epoch": 2.757171798014156, "grad_norm": 0.009558570571243763, "learning_rate": 8.00764203177995e-07, "loss": 0.0071, "num_input_tokens_seen": 45849120, "step": 13780 }, { "epoch": 2.7581722231948578, "grad_norm": 0.3835294246673584, "learning_rate": 7.942005960287602e-07, "loss": 0.0015, "num_input_tokens_seen": 45865856, "step": 13785 }, { "epoch": 2.75917264837556, "grad_norm": 0.00481353048235178, "learning_rate": 7.876635653887076e-07, "loss": 0.0013, "num_input_tokens_seen": 45882784, "step": 13790 }, { "epoch": 2.7601730735562615, "grad_norm": 0.06251227110624313, "learning_rate": 7.811531184351073e-07, "loss": 0.0002, "num_input_tokens_seen": 45899136, "step": 13795 }, { "epoch": 2.761173498736963, "grad_norm": 0.011212627403438091, "learning_rate": 7.746692623160357e-07, "loss": 0.0062, "num_input_tokens_seen": 45916512, "step": 13800 }, { "epoch": 2.762173923917665, "grad_norm": 0.6980820894241333, "learning_rate": 7.68212004150376e-07, "loss": 0.0042, "num_input_tokens_seen": 45933120, "step": 13805 }, { "epoch": 2.763174349098367, "grad_norm": 0.00017238684813492, "learning_rate": 7.617813510278071e-07, "loss": 0.0039, "num_input_tokens_seen": 45950368, "step": 13810 }, { "epoch": 2.7641747742790685, "grad_norm": 0.021036557853221893, "learning_rate": 7.553773100088035e-07, "loss": 0.0014, "num_input_tokens_seen": 45965664, "step": 13815 }, { "epoch": 2.7651751994597706, "grad_norm": 0.16205120086669922, "learning_rate": 7.489998881246102e-07, "loss": 0.0071, "num_input_tokens_seen": 45981760, "step": 13820 }, { "epoch": 2.7661756246404723, "grad_norm": 1.0853997468948364, "learning_rate": 7.426490923772567e-07, "loss": 0.0086, "num_input_tokens_seen": 45997216, "step": 13825 }, { "epoch": 2.767176049821174, "grad_norm": 0.02892439253628254, "learning_rate": 7.363249297395319e-07, "loss": 0.0038, "num_input_tokens_seen": 46015552, "step": 13830 }, { "epoch": 2.7681764750018756, "grad_norm": 0.5620378255844116, "learning_rate": 7.300274071549873e-07, "loss": 0.0077, "num_input_tokens_seen": 46032256, "step": 13835 }, { "epoch": 2.7691769001825777, "grad_norm": 2.367673397064209, "learning_rate": 7.237565315379225e-07, "loss": 0.0063, "num_input_tokens_seen": 46048928, "step": 13840 }, { "epoch": 2.7701773253632793, "grad_norm": 0.2110394984483719, "learning_rate": 7.175123097733772e-07, "loss": 0.0006, "num_input_tokens_seen": 46065184, "step": 13845 }, { "epoch": 2.7711777505439814, "grad_norm": 0.0275700893253088, "learning_rate": 7.112947487171367e-07, "loss": 0.0013, "num_input_tokens_seen": 46081472, "step": 13850 }, { "epoch": 2.772178175724683, "grad_norm": 0.027688276022672653, "learning_rate": 7.051038551957073e-07, "loss": 0.0007, "num_input_tokens_seen": 46098368, "step": 13855 }, { "epoch": 2.7731786009053847, "grad_norm": 0.29458293318748474, "learning_rate": 6.9893963600631e-07, "loss": 0.0146, "num_input_tokens_seen": 46115424, "step": 13860 }, { "epoch": 2.7741790260860864, "grad_norm": 0.04649446904659271, "learning_rate": 6.928020979168948e-07, "loss": 0.0007, "num_input_tokens_seen": 46132352, "step": 13865 }, { "epoch": 2.7751794512667884, "grad_norm": 0.3606918454170227, "learning_rate": 6.866912476661075e-07, "loss": 0.0023, "num_input_tokens_seen": 46149792, "step": 13870 }, { "epoch": 2.77617987644749, "grad_norm": 0.058070261031389236, "learning_rate": 6.80607091963284e-07, "loss": 0.0031, "num_input_tokens_seen": 46166400, "step": 13875 }, { "epoch": 2.777180301628192, "grad_norm": 0.01889689825475216, "learning_rate": 6.745496374884725e-07, "loss": 0.0053, "num_input_tokens_seen": 46183104, "step": 13880 }, { "epoch": 2.778180726808894, "grad_norm": 0.8481465578079224, "learning_rate": 6.685188908923917e-07, "loss": 0.0125, "num_input_tokens_seen": 46200160, "step": 13885 }, { "epoch": 2.7791811519895955, "grad_norm": 1.760446310043335, "learning_rate": 6.625148587964286e-07, "loss": 0.0069, "num_input_tokens_seen": 46217792, "step": 13890 }, { "epoch": 2.780181577170297, "grad_norm": 0.020744048058986664, "learning_rate": 6.5653754779266e-07, "loss": 0.004, "num_input_tokens_seen": 46232832, "step": 13895 }, { "epoch": 2.781182002350999, "grad_norm": 0.008959859609603882, "learning_rate": 6.505869644438057e-07, "loss": 0.0004, "num_input_tokens_seen": 46250304, "step": 13900 }, { "epoch": 2.782182427531701, "grad_norm": 0.021957986056804657, "learning_rate": 6.446631152832478e-07, "loss": 0.0038, "num_input_tokens_seen": 46266752, "step": 13905 }, { "epoch": 2.783182852712403, "grad_norm": 0.4723031520843506, "learning_rate": 6.387660068150225e-07, "loss": 0.0049, "num_input_tokens_seen": 46283552, "step": 13910 }, { "epoch": 2.7841832778931046, "grad_norm": 0.7204026579856873, "learning_rate": 6.328956455137924e-07, "loss": 0.002, "num_input_tokens_seen": 46300768, "step": 13915 }, { "epoch": 2.7851837030738062, "grad_norm": 0.003451480297371745, "learning_rate": 6.270520378248601e-07, "loss": 0.0132, "num_input_tokens_seen": 46317984, "step": 13920 }, { "epoch": 2.786184128254508, "grad_norm": 0.012446947395801544, "learning_rate": 6.212351901641628e-07, "loss": 0.0018, "num_input_tokens_seen": 46333856, "step": 13925 }, { "epoch": 2.78718455343521, "grad_norm": 0.005464858375489712, "learning_rate": 6.154451089182389e-07, "loss": 0.001, "num_input_tokens_seen": 46350912, "step": 13930 }, { "epoch": 2.7881849786159116, "grad_norm": 0.0008582032169215381, "learning_rate": 6.096818004442533e-07, "loss": 0.0002, "num_input_tokens_seen": 46367840, "step": 13935 }, { "epoch": 2.7891854037966137, "grad_norm": 0.08223076909780502, "learning_rate": 6.03945271069975e-07, "loss": 0.0097, "num_input_tokens_seen": 46383968, "step": 13940 }, { "epoch": 2.7901858289773154, "grad_norm": 0.7391047477722168, "learning_rate": 5.982355270937628e-07, "loss": 0.0039, "num_input_tokens_seen": 46399744, "step": 13945 }, { "epoch": 2.791186254158017, "grad_norm": 1.6347475051879883, "learning_rate": 5.925525747845717e-07, "loss": 0.0047, "num_input_tokens_seen": 46415968, "step": 13950 }, { "epoch": 2.792186679338719, "grad_norm": 1.4310767650604248, "learning_rate": 5.868964203819439e-07, "loss": 0.0064, "num_input_tokens_seen": 46433280, "step": 13955 }, { "epoch": 2.7931871045194208, "grad_norm": 0.6000454425811768, "learning_rate": 5.812670700959922e-07, "loss": 0.0093, "num_input_tokens_seen": 46451040, "step": 13960 }, { "epoch": 2.7941875297001224, "grad_norm": 0.0004939927603118122, "learning_rate": 5.756645301074087e-07, "loss": 0.0038, "num_input_tokens_seen": 46467648, "step": 13965 }, { "epoch": 2.7951879548808245, "grad_norm": 0.04235506057739258, "learning_rate": 5.700888065674398e-07, "loss": 0.0082, "num_input_tokens_seen": 46482656, "step": 13970 }, { "epoch": 2.796188380061526, "grad_norm": 0.432803213596344, "learning_rate": 5.645399055978967e-07, "loss": 0.0033, "num_input_tokens_seen": 46498784, "step": 13975 }, { "epoch": 2.797188805242228, "grad_norm": 0.3252566158771515, "learning_rate": 5.590178332911366e-07, "loss": 0.0009, "num_input_tokens_seen": 46516064, "step": 13980 }, { "epoch": 2.79818923042293, "grad_norm": 0.01758207008242607, "learning_rate": 5.535225957100654e-07, "loss": 0.0053, "num_input_tokens_seen": 46532704, "step": 13985 }, { "epoch": 2.7991896556036315, "grad_norm": 0.01629800535738468, "learning_rate": 5.480541988881177e-07, "loss": 0.0044, "num_input_tokens_seen": 46549184, "step": 13990 }, { "epoch": 2.800190080784333, "grad_norm": 0.0037658638320863247, "learning_rate": 5.426126488292715e-07, "loss": 0.0082, "num_input_tokens_seen": 46564864, "step": 13995 }, { "epoch": 2.8011905059650353, "grad_norm": 1.712538242340088, "learning_rate": 5.371979515080145e-07, "loss": 0.0059, "num_input_tokens_seen": 46581440, "step": 14000 }, { "epoch": 2.802190931145737, "grad_norm": 0.7744560837745667, "learning_rate": 5.318101128693575e-07, "loss": 0.0085, "num_input_tokens_seen": 46598592, "step": 14005 }, { "epoch": 2.8031913563264386, "grad_norm": 0.03285866603255272, "learning_rate": 5.264491388288272e-07, "loss": 0.0019, "num_input_tokens_seen": 46614752, "step": 14010 }, { "epoch": 2.8041917815071407, "grad_norm": 0.07157700508832932, "learning_rate": 5.211150352724459e-07, "loss": 0.0014, "num_input_tokens_seen": 46631232, "step": 14015 }, { "epoch": 2.8051922066878423, "grad_norm": 0.025913609191775322, "learning_rate": 5.158078080567425e-07, "loss": 0.0082, "num_input_tokens_seen": 46647328, "step": 14020 }, { "epoch": 2.8061926318685444, "grad_norm": 0.022490929812192917, "learning_rate": 5.105274630087282e-07, "loss": 0.0153, "num_input_tokens_seen": 46663840, "step": 14025 }, { "epoch": 2.807193057049246, "grad_norm": 0.15464244782924652, "learning_rate": 5.052740059259042e-07, "loss": 0.0031, "num_input_tokens_seen": 46679904, "step": 14030 }, { "epoch": 2.8081934822299477, "grad_norm": 0.16805796325206757, "learning_rate": 5.000474425762541e-07, "loss": 0.0016, "num_input_tokens_seen": 46697792, "step": 14035 }, { "epoch": 2.8091939074106493, "grad_norm": 0.7833674550056458, "learning_rate": 4.948477786982237e-07, "loss": 0.0095, "num_input_tokens_seen": 46714784, "step": 14040 }, { "epoch": 2.8101943325913514, "grad_norm": 1.0255088806152344, "learning_rate": 4.89675020000735e-07, "loss": 0.0044, "num_input_tokens_seen": 46732320, "step": 14045 }, { "epoch": 2.811194757772053, "grad_norm": 0.20480923354625702, "learning_rate": 4.845291721631645e-07, "loss": 0.0039, "num_input_tokens_seen": 46747648, "step": 14050 }, { "epoch": 2.812195182952755, "grad_norm": 0.0014816083712503314, "learning_rate": 4.794102408353402e-07, "loss": 0.0071, "num_input_tokens_seen": 46764928, "step": 14055 }, { "epoch": 2.813195608133457, "grad_norm": 0.789983332157135, "learning_rate": 4.743182316375439e-07, "loss": 0.0094, "num_input_tokens_seen": 46781472, "step": 14060 }, { "epoch": 2.8141960333141585, "grad_norm": 1.3696625232696533, "learning_rate": 4.692531501604924e-07, "loss": 0.0135, "num_input_tokens_seen": 46798656, "step": 14065 }, { "epoch": 2.81519645849486, "grad_norm": 1.2937527894973755, "learning_rate": 4.6421500196534294e-07, "loss": 0.0062, "num_input_tokens_seen": 46815232, "step": 14070 }, { "epoch": 2.816196883675562, "grad_norm": 0.0012796290684491396, "learning_rate": 4.592037925836734e-07, "loss": 0.0039, "num_input_tokens_seen": 46831712, "step": 14075 }, { "epoch": 2.817197308856264, "grad_norm": 0.41710588335990906, "learning_rate": 4.5421952751749375e-07, "loss": 0.0024, "num_input_tokens_seen": 46848288, "step": 14080 }, { "epoch": 2.818197734036966, "grad_norm": 0.05578610301017761, "learning_rate": 4.492622122392265e-07, "loss": 0.0036, "num_input_tokens_seen": 46864864, "step": 14085 }, { "epoch": 2.8191981592176676, "grad_norm": 0.005146725103259087, "learning_rate": 4.443318521917067e-07, "loss": 0.0042, "num_input_tokens_seen": 46881536, "step": 14090 }, { "epoch": 2.8201985843983692, "grad_norm": 0.015356880612671375, "learning_rate": 4.3942845278816814e-07, "loss": 0.0023, "num_input_tokens_seen": 46898048, "step": 14095 }, { "epoch": 2.821199009579071, "grad_norm": 0.0015450617065653205, "learning_rate": 4.3455201941224877e-07, "loss": 0.0029, "num_input_tokens_seen": 46914624, "step": 14100 }, { "epoch": 2.822199434759773, "grad_norm": 0.04543983191251755, "learning_rate": 4.2970255741798524e-07, "loss": 0.0025, "num_input_tokens_seen": 46930624, "step": 14105 }, { "epoch": 2.8231998599404746, "grad_norm": 0.2659851014614105, "learning_rate": 4.248800721297852e-07, "loss": 0.0021, "num_input_tokens_seen": 46947296, "step": 14110 }, { "epoch": 2.8242002851211767, "grad_norm": 0.2597131133079529, "learning_rate": 4.2008456884244927e-07, "loss": 0.0211, "num_input_tokens_seen": 46963232, "step": 14115 }, { "epoch": 2.8252007103018784, "grad_norm": 1.0440688133239746, "learning_rate": 4.153160528211575e-07, "loss": 0.0019, "num_input_tokens_seen": 46978688, "step": 14120 }, { "epoch": 2.82620113548258, "grad_norm": 0.08481547981500626, "learning_rate": 4.105745293014468e-07, "loss": 0.0031, "num_input_tokens_seen": 46995744, "step": 14125 }, { "epoch": 2.8272015606632817, "grad_norm": 0.5087813138961792, "learning_rate": 4.0586000348922513e-07, "loss": 0.0032, "num_input_tokens_seen": 47012128, "step": 14130 }, { "epoch": 2.8282019858439837, "grad_norm": 1.1093705892562866, "learning_rate": 4.011724805607575e-07, "loss": 0.0023, "num_input_tokens_seen": 47029408, "step": 14135 }, { "epoch": 2.8292024110246854, "grad_norm": 0.6753928065299988, "learning_rate": 3.9651196566266593e-07, "loss": 0.0064, "num_input_tokens_seen": 47046112, "step": 14140 }, { "epoch": 2.8302028362053875, "grad_norm": 0.0030104496981948614, "learning_rate": 3.9187846391191273e-07, "loss": 0.0024, "num_input_tokens_seen": 47063072, "step": 14145 }, { "epoch": 2.831203261386089, "grad_norm": 0.06650929152965546, "learning_rate": 3.8727198039579804e-07, "loss": 0.0093, "num_input_tokens_seen": 47080160, "step": 14150 }, { "epoch": 2.832203686566791, "grad_norm": 0.28551894426345825, "learning_rate": 3.8269252017197054e-07, "loss": 0.0043, "num_input_tokens_seen": 47096448, "step": 14155 }, { "epoch": 2.8332041117474924, "grad_norm": 0.018239686265587807, "learning_rate": 3.781400882684e-07, "loss": 0.0007, "num_input_tokens_seen": 47113664, "step": 14160 }, { "epoch": 2.8342045369281945, "grad_norm": 0.34065768122673035, "learning_rate": 3.7361468968337976e-07, "loss": 0.0018, "num_input_tokens_seen": 47129984, "step": 14165 }, { "epoch": 2.835204962108896, "grad_norm": 0.03855415806174278, "learning_rate": 3.6911632938552433e-07, "loss": 0.0013, "num_input_tokens_seen": 47145920, "step": 14170 }, { "epoch": 2.8362053872895983, "grad_norm": 1.1146639585494995, "learning_rate": 3.64645012313769e-07, "loss": 0.003, "num_input_tokens_seen": 47162752, "step": 14175 }, { "epoch": 2.8372058124703, "grad_norm": 0.008468233048915863, "learning_rate": 3.6020074337734244e-07, "loss": 0.0077, "num_input_tokens_seen": 47178656, "step": 14180 }, { "epoch": 2.8382062376510016, "grad_norm": 0.07861342281103134, "learning_rate": 3.557835274557886e-07, "loss": 0.0011, "num_input_tokens_seen": 47195104, "step": 14185 }, { "epoch": 2.839206662831703, "grad_norm": 0.5510034561157227, "learning_rate": 3.5139336939894184e-07, "loss": 0.0048, "num_input_tokens_seen": 47212704, "step": 14190 }, { "epoch": 2.8402070880124053, "grad_norm": 0.02640078216791153, "learning_rate": 3.470302740269327e-07, "loss": 0.0061, "num_input_tokens_seen": 47229824, "step": 14195 }, { "epoch": 2.841207513193107, "grad_norm": 0.05584491044282913, "learning_rate": 3.4269424613017907e-07, "loss": 0.0061, "num_input_tokens_seen": 47247072, "step": 14200 }, { "epoch": 2.842207938373809, "grad_norm": 0.2541450560092926, "learning_rate": 3.3838529046937295e-07, "loss": 0.0026, "num_input_tokens_seen": 47263648, "step": 14205 }, { "epoch": 2.8432083635545107, "grad_norm": 2.0473499298095703, "learning_rate": 3.34103411775491e-07, "loss": 0.0107, "num_input_tokens_seen": 47281056, "step": 14210 }, { "epoch": 2.8442087887352123, "grad_norm": 1.1724183559417725, "learning_rate": 3.298486147497809e-07, "loss": 0.0028, "num_input_tokens_seen": 47297024, "step": 14215 }, { "epoch": 2.8452092139159144, "grad_norm": 0.006476056762039661, "learning_rate": 3.256209040637503e-07, "loss": 0.0037, "num_input_tokens_seen": 47313440, "step": 14220 }, { "epoch": 2.846209639096616, "grad_norm": 0.0024868613108992577, "learning_rate": 3.214202843591696e-07, "loss": 0.0004, "num_input_tokens_seen": 47330656, "step": 14225 }, { "epoch": 2.8472100642773177, "grad_norm": 0.1444324404001236, "learning_rate": 3.1724676024806886e-07, "loss": 0.0018, "num_input_tokens_seen": 47346880, "step": 14230 }, { "epoch": 2.84821048945802, "grad_norm": 1.7168079614639282, "learning_rate": 3.131003363127216e-07, "loss": 0.0051, "num_input_tokens_seen": 47363296, "step": 14235 }, { "epoch": 2.8492109146387214, "grad_norm": 0.03592948615550995, "learning_rate": 3.089810171056584e-07, "loss": 0.0025, "num_input_tokens_seen": 47379744, "step": 14240 }, { "epoch": 2.850211339819423, "grad_norm": 0.31552281975746155, "learning_rate": 3.048888071496392e-07, "loss": 0.0055, "num_input_tokens_seen": 47395712, "step": 14245 }, { "epoch": 2.851211765000125, "grad_norm": 0.0047025843523442745, "learning_rate": 3.008237109376644e-07, "loss": 0.0003, "num_input_tokens_seen": 47412352, "step": 14250 }, { "epoch": 2.852212190180827, "grad_norm": 0.34631943702697754, "learning_rate": 2.9678573293296653e-07, "loss": 0.0012, "num_input_tokens_seen": 47429120, "step": 14255 }, { "epoch": 2.853212615361529, "grad_norm": 1.796602487564087, "learning_rate": 2.9277487756899645e-07, "loss": 0.009, "num_input_tokens_seen": 47445856, "step": 14260 }, { "epoch": 2.8542130405422306, "grad_norm": 0.028868287801742554, "learning_rate": 2.8879114924943995e-07, "loss": 0.0026, "num_input_tokens_seen": 47463264, "step": 14265 }, { "epoch": 2.855213465722932, "grad_norm": 0.18896105885505676, "learning_rate": 2.8483455234818437e-07, "loss": 0.0091, "num_input_tokens_seen": 47480672, "step": 14270 }, { "epoch": 2.856213890903634, "grad_norm": 0.0005116119282320142, "learning_rate": 2.8090509120933527e-07, "loss": 0.0039, "num_input_tokens_seen": 47496800, "step": 14275 }, { "epoch": 2.857214316084336, "grad_norm": 0.37647685408592224, "learning_rate": 2.7700277014720834e-07, "loss": 0.0026, "num_input_tokens_seen": 47512192, "step": 14280 }, { "epoch": 2.8582147412650376, "grad_norm": 0.03440157696604729, "learning_rate": 2.731275934463151e-07, "loss": 0.0034, "num_input_tokens_seen": 47529152, "step": 14285 }, { "epoch": 2.8592151664457397, "grad_norm": 0.0038820921909064054, "learning_rate": 2.6927956536136056e-07, "loss": 0.0013, "num_input_tokens_seen": 47544928, "step": 14290 }, { "epoch": 2.8602155916264413, "grad_norm": 1.7431918382644653, "learning_rate": 2.654586901172568e-07, "loss": 0.0097, "num_input_tokens_seen": 47561984, "step": 14295 }, { "epoch": 2.861216016807143, "grad_norm": 0.2928279936313629, "learning_rate": 2.616649719090869e-07, "loss": 0.0102, "num_input_tokens_seen": 47577984, "step": 14300 }, { "epoch": 2.8622164419878446, "grad_norm": 0.6232368350028992, "learning_rate": 2.578984149021302e-07, "loss": 0.0145, "num_input_tokens_seen": 47594112, "step": 14305 }, { "epoch": 2.8632168671685467, "grad_norm": 0.014803355559706688, "learning_rate": 2.5415902323183694e-07, "loss": 0.0007, "num_input_tokens_seen": 47610368, "step": 14310 }, { "epoch": 2.8642172923492484, "grad_norm": 2.0789053440093994, "learning_rate": 2.504468010038341e-07, "loss": 0.0059, "num_input_tokens_seen": 47627520, "step": 14315 }, { "epoch": 2.8652177175299505, "grad_norm": 1.0814423561096191, "learning_rate": 2.467617522939197e-07, "loss": 0.0078, "num_input_tokens_seen": 47642656, "step": 14320 }, { "epoch": 2.866218142710652, "grad_norm": 0.5673795342445374, "learning_rate": 2.431038811480518e-07, "loss": 0.0035, "num_input_tokens_seen": 47659552, "step": 14325 }, { "epoch": 2.8672185678913538, "grad_norm": 0.2489253282546997, "learning_rate": 2.3947319158235394e-07, "loss": 0.0018, "num_input_tokens_seen": 47676672, "step": 14330 }, { "epoch": 2.8682189930720554, "grad_norm": 0.14608818292617798, "learning_rate": 2.3586968758310678e-07, "loss": 0.0072, "num_input_tokens_seen": 47693568, "step": 14335 }, { "epoch": 2.8692194182527575, "grad_norm": 0.09965641051530838, "learning_rate": 2.3229337310673727e-07, "loss": 0.0032, "num_input_tokens_seen": 47710048, "step": 14340 }, { "epoch": 2.870219843433459, "grad_norm": 0.02677687630057335, "learning_rate": 2.2874425207982387e-07, "loss": 0.0013, "num_input_tokens_seen": 47727936, "step": 14345 }, { "epoch": 2.8712202686141612, "grad_norm": 0.01108902134001255, "learning_rate": 2.2522232839908842e-07, "loss": 0.0034, "num_input_tokens_seen": 47745056, "step": 14350 }, { "epoch": 2.872220693794863, "grad_norm": 0.08057229220867157, "learning_rate": 2.217276059313933e-07, "loss": 0.0184, "num_input_tokens_seen": 47761536, "step": 14355 }, { "epoch": 2.8732211189755645, "grad_norm": 0.33636265993118286, "learning_rate": 2.182600885137276e-07, "loss": 0.0078, "num_input_tokens_seen": 47778304, "step": 14360 }, { "epoch": 2.874221544156266, "grad_norm": 0.23361791670322418, "learning_rate": 2.148197799532209e-07, "loss": 0.0023, "num_input_tokens_seen": 47794656, "step": 14365 }, { "epoch": 2.8752219693369683, "grad_norm": 3.437770128250122, "learning_rate": 2.1140668402712393e-07, "loss": 0.0197, "num_input_tokens_seen": 47811200, "step": 14370 }, { "epoch": 2.87622239451767, "grad_norm": 0.43796396255493164, "learning_rate": 2.0802080448280858e-07, "loss": 0.0034, "num_input_tokens_seen": 47828096, "step": 14375 }, { "epoch": 2.877222819698372, "grad_norm": 0.09619075804948807, "learning_rate": 2.0466214503777058e-07, "loss": 0.0064, "num_input_tokens_seen": 47844672, "step": 14380 }, { "epoch": 2.8782232448790737, "grad_norm": 0.2009647935628891, "learning_rate": 2.0133070937961017e-07, "loss": 0.0012, "num_input_tokens_seen": 47859968, "step": 14385 }, { "epoch": 2.8792236700597753, "grad_norm": 0.015301394276320934, "learning_rate": 1.9802650116604593e-07, "loss": 0.0033, "num_input_tokens_seen": 47876608, "step": 14390 }, { "epoch": 2.880224095240477, "grad_norm": 1.40193772315979, "learning_rate": 1.947495240248981e-07, "loss": 0.0102, "num_input_tokens_seen": 47893024, "step": 14395 }, { "epoch": 2.881224520421179, "grad_norm": 0.0199880488216877, "learning_rate": 1.9149978155409143e-07, "loss": 0.0068, "num_input_tokens_seen": 47909984, "step": 14400 }, { "epoch": 2.8822249456018807, "grad_norm": 0.359917551279068, "learning_rate": 1.8827727732164958e-07, "loss": 0.0015, "num_input_tokens_seen": 47926784, "step": 14405 }, { "epoch": 2.883225370782583, "grad_norm": 0.023170724511146545, "learning_rate": 1.85082014865684e-07, "loss": 0.0012, "num_input_tokens_seen": 47943360, "step": 14410 }, { "epoch": 2.8842257959632844, "grad_norm": 0.05834521725773811, "learning_rate": 1.819139976944023e-07, "loss": 0.0015, "num_input_tokens_seen": 47959584, "step": 14415 }, { "epoch": 2.885226221143986, "grad_norm": 0.3212447166442871, "learning_rate": 1.7877322928609442e-07, "loss": 0.002, "num_input_tokens_seen": 47977120, "step": 14420 }, { "epoch": 2.8862266463246877, "grad_norm": 0.005522616673260927, "learning_rate": 1.7565971308913798e-07, "loss": 0.0027, "num_input_tokens_seen": 47995136, "step": 14425 }, { "epoch": 2.88722707150539, "grad_norm": 0.008083946071565151, "learning_rate": 1.7257345252198187e-07, "loss": 0.0169, "num_input_tokens_seen": 48011968, "step": 14430 }, { "epoch": 2.8882274966860915, "grad_norm": 0.00019500641792546958, "learning_rate": 1.6951445097316e-07, "loss": 0.0026, "num_input_tokens_seen": 48029408, "step": 14435 }, { "epoch": 2.8892279218667936, "grad_norm": 0.06523102521896362, "learning_rate": 1.6648271180126628e-07, "loss": 0.0192, "num_input_tokens_seen": 48046432, "step": 14440 }, { "epoch": 2.890228347047495, "grad_norm": 0.27350881695747375, "learning_rate": 1.6347823833497145e-07, "loss": 0.0293, "num_input_tokens_seen": 48063168, "step": 14445 }, { "epoch": 2.891228772228197, "grad_norm": 0.044810544699430466, "learning_rate": 1.605010338730034e-07, "loss": 0.0028, "num_input_tokens_seen": 48080128, "step": 14450 }, { "epoch": 2.892229197408899, "grad_norm": 0.0014945862349122763, "learning_rate": 1.5755110168415577e-07, "loss": 0.0012, "num_input_tokens_seen": 48096608, "step": 14455 }, { "epoch": 2.8932296225896006, "grad_norm": 0.08826776593923569, "learning_rate": 1.5462844500727657e-07, "loss": 0.0093, "num_input_tokens_seen": 48113920, "step": 14460 }, { "epoch": 2.8942300477703022, "grad_norm": 0.10818234086036682, "learning_rate": 1.517330670512629e-07, "loss": 0.002, "num_input_tokens_seen": 48131360, "step": 14465 }, { "epoch": 2.8952304729510043, "grad_norm": 0.09250593930482864, "learning_rate": 1.4886497099506903e-07, "loss": 0.0086, "num_input_tokens_seen": 48147104, "step": 14470 }, { "epoch": 2.896230898131706, "grad_norm": 0.7708470821380615, "learning_rate": 1.4602415998768992e-07, "loss": 0.0231, "num_input_tokens_seen": 48163520, "step": 14475 }, { "epoch": 2.8972313233124076, "grad_norm": 0.01487722061574459, "learning_rate": 1.4321063714816674e-07, "loss": 0.0146, "num_input_tokens_seen": 48180192, "step": 14480 }, { "epoch": 2.8982317484931097, "grad_norm": 1.3531252145767212, "learning_rate": 1.4042440556557568e-07, "loss": 0.0094, "num_input_tokens_seen": 48196992, "step": 14485 }, { "epoch": 2.8992321736738114, "grad_norm": 0.3768194615840912, "learning_rate": 1.376654682990308e-07, "loss": 0.0014, "num_input_tokens_seen": 48213792, "step": 14490 }, { "epoch": 2.900232598854513, "grad_norm": 0.49699288606643677, "learning_rate": 1.3493382837768132e-07, "loss": 0.0012, "num_input_tokens_seen": 48229376, "step": 14495 }, { "epoch": 2.901233024035215, "grad_norm": 0.07719047367572784, "learning_rate": 1.3222948880070306e-07, "loss": 0.0097, "num_input_tokens_seen": 48245280, "step": 14500 }, { "epoch": 2.9022334492159167, "grad_norm": 0.12778985500335693, "learning_rate": 1.2955245253729597e-07, "loss": 0.0008, "num_input_tokens_seen": 48262432, "step": 14505 }, { "epoch": 2.9032338743966184, "grad_norm": 0.7701250910758972, "learning_rate": 1.269027225266839e-07, "loss": 0.0031, "num_input_tokens_seen": 48279584, "step": 14510 }, { "epoch": 2.9042342995773205, "grad_norm": 0.27826884388923645, "learning_rate": 1.2428030167811743e-07, "loss": 0.0044, "num_input_tokens_seen": 48296000, "step": 14515 }, { "epoch": 2.905234724758022, "grad_norm": 0.00854239147156477, "learning_rate": 1.2168519287084622e-07, "loss": 0.0028, "num_input_tokens_seen": 48313920, "step": 14520 }, { "epoch": 2.9062351499387242, "grad_norm": 0.8932532072067261, "learning_rate": 1.191173989541522e-07, "loss": 0.0067, "num_input_tokens_seen": 48330176, "step": 14525 }, { "epoch": 2.907235575119426, "grad_norm": 0.014440479688346386, "learning_rate": 1.1657692274731624e-07, "loss": 0.0062, "num_input_tokens_seen": 48346752, "step": 14530 }, { "epoch": 2.9082360003001275, "grad_norm": 1.0627107620239258, "learning_rate": 1.1406376703962385e-07, "loss": 0.0144, "num_input_tokens_seen": 48364384, "step": 14535 }, { "epoch": 2.909236425480829, "grad_norm": 0.957610011100769, "learning_rate": 1.1157793459037058e-07, "loss": 0.0041, "num_input_tokens_seen": 48380800, "step": 14540 }, { "epoch": 2.9102368506615313, "grad_norm": 0.008517167530953884, "learning_rate": 1.0911942812885378e-07, "loss": 0.0018, "num_input_tokens_seen": 48397312, "step": 14545 }, { "epoch": 2.911237275842233, "grad_norm": 0.00145952426828444, "learning_rate": 1.0668825035436426e-07, "loss": 0.0018, "num_input_tokens_seen": 48414592, "step": 14550 }, { "epoch": 2.912237701022935, "grad_norm": 0.00036152126267552376, "learning_rate": 1.0428440393618621e-07, "loss": 0.0009, "num_input_tokens_seen": 48431040, "step": 14555 }, { "epoch": 2.9132381262036366, "grad_norm": 0.01873910427093506, "learning_rate": 1.0190789151360014e-07, "loss": 0.0042, "num_input_tokens_seen": 48447424, "step": 14560 }, { "epoch": 2.9142385513843383, "grad_norm": 0.04497993737459183, "learning_rate": 9.955871569587438e-08, "loss": 0.0067, "num_input_tokens_seen": 48465856, "step": 14565 }, { "epoch": 2.91523897656504, "grad_norm": 0.6971708536148071, "learning_rate": 9.723687906225964e-08, "loss": 0.0102, "num_input_tokens_seen": 48482176, "step": 14570 }, { "epoch": 2.916239401745742, "grad_norm": 0.5259365439414978, "learning_rate": 9.494238416199453e-08, "loss": 0.001, "num_input_tokens_seen": 48498016, "step": 14575 }, { "epoch": 2.9172398269264437, "grad_norm": 0.30338263511657715, "learning_rate": 9.267523351429441e-08, "loss": 0.0019, "num_input_tokens_seen": 48514464, "step": 14580 }, { "epoch": 2.9182402521071458, "grad_norm": 1.076026201248169, "learning_rate": 9.043542960835705e-08, "loss": 0.008, "num_input_tokens_seen": 48530240, "step": 14585 }, { "epoch": 2.9192406772878474, "grad_norm": 0.5502411723136902, "learning_rate": 8.822297490335141e-08, "loss": 0.0047, "num_input_tokens_seen": 48546624, "step": 14590 }, { "epoch": 2.920241102468549, "grad_norm": 1.4354311227798462, "learning_rate": 8.603787182841772e-08, "loss": 0.013, "num_input_tokens_seen": 48563776, "step": 14595 }, { "epoch": 2.9212415276492507, "grad_norm": 0.009782413020730019, "learning_rate": 8.388012278266744e-08, "loss": 0.0054, "num_input_tokens_seen": 48580032, "step": 14600 }, { "epoch": 2.922241952829953, "grad_norm": 0.03704375773668289, "learning_rate": 8.17497301351805e-08, "loss": 0.0056, "num_input_tokens_seen": 48597760, "step": 14605 }, { "epoch": 2.9232423780106545, "grad_norm": 1.8117647171020508, "learning_rate": 7.964669622499976e-08, "loss": 0.0092, "num_input_tokens_seen": 48615104, "step": 14610 }, { "epoch": 2.9242428031913565, "grad_norm": 0.5814581513404846, "learning_rate": 7.75710233611282e-08, "loss": 0.0024, "num_input_tokens_seen": 48631648, "step": 14615 }, { "epoch": 2.925243228372058, "grad_norm": 0.46254706382751465, "learning_rate": 7.552271382253174e-08, "loss": 0.0095, "num_input_tokens_seen": 48647584, "step": 14620 }, { "epoch": 2.92624365355276, "grad_norm": 0.010827763006091118, "learning_rate": 7.350176985812806e-08, "loss": 0.0034, "num_input_tokens_seen": 48662944, "step": 14625 }, { "epoch": 2.9272440787334615, "grad_norm": 0.15264484286308289, "learning_rate": 7.150819368679229e-08, "loss": 0.0042, "num_input_tokens_seen": 48680096, "step": 14630 }, { "epoch": 2.9282445039141636, "grad_norm": 0.5206581950187683, "learning_rate": 6.954198749735408e-08, "loss": 0.0056, "num_input_tokens_seen": 48697888, "step": 14635 }, { "epoch": 2.9292449290948652, "grad_norm": 0.6589008569717407, "learning_rate": 6.760315344858658e-08, "loss": 0.0047, "num_input_tokens_seen": 48715360, "step": 14640 }, { "epoch": 2.9302453542755673, "grad_norm": 1.2055493593215942, "learning_rate": 6.569169366921479e-08, "loss": 0.0018, "num_input_tokens_seen": 48731808, "step": 14645 }, { "epoch": 2.931245779456269, "grad_norm": 0.0015922492602840066, "learning_rate": 6.38076102579016e-08, "loss": 0.0017, "num_input_tokens_seen": 48748000, "step": 14650 }, { "epoch": 2.9322462046369706, "grad_norm": 1.6169053316116333, "learning_rate": 6.195090528326453e-08, "loss": 0.0194, "num_input_tokens_seen": 48763872, "step": 14655 }, { "epoch": 2.9332466298176723, "grad_norm": 0.033600740134716034, "learning_rate": 6.012158078384511e-08, "loss": 0.0092, "num_input_tokens_seen": 48780320, "step": 14660 }, { "epoch": 2.9342470549983743, "grad_norm": 0.010385665111243725, "learning_rate": 5.8319638768139504e-08, "loss": 0.0034, "num_input_tokens_seen": 48797952, "step": 14665 }, { "epoch": 2.935247480179076, "grad_norm": 1.4164212942123413, "learning_rate": 5.654508121456792e-08, "loss": 0.019, "num_input_tokens_seen": 48815008, "step": 14670 }, { "epoch": 2.936247905359778, "grad_norm": 0.10302630066871643, "learning_rate": 5.479791007148571e-08, "loss": 0.0066, "num_input_tokens_seen": 48831936, "step": 14675 }, { "epoch": 2.9372483305404797, "grad_norm": 0.0018415686208754778, "learning_rate": 5.307812725718342e-08, "loss": 0.0108, "num_input_tokens_seen": 48848544, "step": 14680 }, { "epoch": 2.9382487557211814, "grad_norm": 0.749054491519928, "learning_rate": 5.1385734659881165e-08, "loss": 0.0065, "num_input_tokens_seen": 48865344, "step": 14685 }, { "epoch": 2.939249180901883, "grad_norm": 0.0004770423984155059, "learning_rate": 4.9720734137720384e-08, "loss": 0.0002, "num_input_tokens_seen": 48882656, "step": 14690 }, { "epoch": 2.940249606082585, "grad_norm": 0.12848667800426483, "learning_rate": 4.8083127518772084e-08, "loss": 0.0013, "num_input_tokens_seen": 48899168, "step": 14695 }, { "epoch": 2.9412500312632868, "grad_norm": 0.00011170543439220637, "learning_rate": 4.647291660103137e-08, "loss": 0.0046, "num_input_tokens_seen": 48914624, "step": 14700 }, { "epoch": 2.942250456443989, "grad_norm": 0.10075289011001587, "learning_rate": 4.489010315241182e-08, "loss": 0.0015, "num_input_tokens_seen": 48931040, "step": 14705 }, { "epoch": 2.9432508816246905, "grad_norm": 0.8776507377624512, "learning_rate": 4.3334688910748324e-08, "loss": 0.0066, "num_input_tokens_seen": 48946592, "step": 14710 }, { "epoch": 2.944251306805392, "grad_norm": 0.0035674963146448135, "learning_rate": 4.1806675583788703e-08, "loss": 0.0051, "num_input_tokens_seen": 48962688, "step": 14715 }, { "epoch": 2.9452517319860942, "grad_norm": 0.020074935629963875, "learning_rate": 4.030606484920208e-08, "loss": 0.0009, "num_input_tokens_seen": 48979616, "step": 14720 }, { "epoch": 2.946252157166796, "grad_norm": 0.017800666391849518, "learning_rate": 3.8832858354567736e-08, "loss": 0.0011, "num_input_tokens_seen": 48996000, "step": 14725 }, { "epoch": 2.9472525823474975, "grad_norm": 0.3195970952510834, "learning_rate": 3.7387057717380715e-08, "loss": 0.0012, "num_input_tokens_seen": 49011776, "step": 14730 }, { "epoch": 2.9482530075281996, "grad_norm": 0.0029207849875092506, "learning_rate": 3.596866452503789e-08, "loss": 0.0008, "num_input_tokens_seen": 49028512, "step": 14735 }, { "epoch": 2.9492534327089013, "grad_norm": 0.014407744631171227, "learning_rate": 3.457768033485465e-08, "loss": 0.0144, "num_input_tokens_seen": 49045824, "step": 14740 }, { "epoch": 2.950253857889603, "grad_norm": 0.3978143632411957, "learning_rate": 3.321410667404268e-08, "loss": 0.0129, "num_input_tokens_seen": 49062368, "step": 14745 }, { "epoch": 2.951254283070305, "grad_norm": 1.726647973060608, "learning_rate": 3.1877945039726634e-08, "loss": 0.0086, "num_input_tokens_seen": 49079520, "step": 14750 }, { "epoch": 2.9522547082510067, "grad_norm": 0.22471491992473602, "learning_rate": 3.056919689893023e-08, "loss": 0.0042, "num_input_tokens_seen": 49096800, "step": 14755 }, { "epoch": 2.9532551334317083, "grad_norm": 0.6711318492889404, "learning_rate": 2.928786368858183e-08, "loss": 0.0052, "num_input_tokens_seen": 49113792, "step": 14760 }, { "epoch": 2.9542555586124104, "grad_norm": 0.6739404201507568, "learning_rate": 2.803394681550886e-08, "loss": 0.0009, "num_input_tokens_seen": 49131040, "step": 14765 }, { "epoch": 2.955255983793112, "grad_norm": 0.3748348653316498, "learning_rate": 2.6807447656432283e-08, "loss": 0.0021, "num_input_tokens_seen": 49148160, "step": 14770 }, { "epoch": 2.9562564089738137, "grad_norm": 0.09970913082361221, "learning_rate": 2.5608367557977684e-08, "loss": 0.0069, "num_input_tokens_seen": 49164512, "step": 14775 }, { "epoch": 2.957256834154516, "grad_norm": 1.040163278579712, "learning_rate": 2.443670783666141e-08, "loss": 0.0061, "num_input_tokens_seen": 49181024, "step": 14780 }, { "epoch": 2.9582572593352174, "grad_norm": 0.02159752883017063, "learning_rate": 2.3292469778896097e-08, "loss": 0.0003, "num_input_tokens_seen": 49196992, "step": 14785 }, { "epoch": 2.9592576845159195, "grad_norm": 1.130287528038025, "learning_rate": 2.217565464098792e-08, "loss": 0.0073, "num_input_tokens_seen": 49213440, "step": 14790 }, { "epoch": 2.960258109696621, "grad_norm": 0.032430630177259445, "learning_rate": 2.1086263649128247e-08, "loss": 0.0013, "num_input_tokens_seen": 49230528, "step": 14795 }, { "epoch": 2.961258534877323, "grad_norm": 0.04397590830922127, "learning_rate": 2.002429799940475e-08, "loss": 0.0042, "num_input_tokens_seen": 49247840, "step": 14800 }, { "epoch": 2.9622589600580245, "grad_norm": 0.9333756566047668, "learning_rate": 1.8989758857790308e-08, "loss": 0.0113, "num_input_tokens_seen": 49264896, "step": 14805 }, { "epoch": 2.9632593852387266, "grad_norm": 1.2465392351150513, "learning_rate": 1.7982647360151318e-08, "loss": 0.007, "num_input_tokens_seen": 49282016, "step": 14810 }, { "epoch": 2.964259810419428, "grad_norm": 0.4767018258571625, "learning_rate": 1.7002964612231055e-08, "loss": 0.0031, "num_input_tokens_seen": 49298624, "step": 14815 }, { "epoch": 2.9652602356001303, "grad_norm": 2.005401134490967, "learning_rate": 1.6050711689663546e-08, "loss": 0.0087, "num_input_tokens_seen": 49316320, "step": 14820 }, { "epoch": 2.966260660780832, "grad_norm": 1.832701563835144, "learning_rate": 1.512588963796524e-08, "loss": 0.0092, "num_input_tokens_seen": 49332448, "step": 14825 }, { "epoch": 2.9672610859615336, "grad_norm": 1.6767756938934326, "learning_rate": 1.4228499472535017e-08, "loss": 0.0068, "num_input_tokens_seen": 49350496, "step": 14830 }, { "epoch": 2.9682615111422352, "grad_norm": 0.027200087904930115, "learning_rate": 1.33585421786514e-08, "loss": 0.0007, "num_input_tokens_seen": 49366592, "step": 14835 }, { "epoch": 2.9692619363229373, "grad_norm": 0.2780955135822296, "learning_rate": 1.2516018711478117e-08, "loss": 0.0034, "num_input_tokens_seen": 49383872, "step": 14840 }, { "epoch": 2.970262361503639, "grad_norm": 0.06683382391929626, "learning_rate": 1.170092999605299e-08, "loss": 0.0163, "num_input_tokens_seen": 49400288, "step": 14845 }, { "epoch": 2.971262786684341, "grad_norm": 0.013099157251417637, "learning_rate": 1.0913276927296267e-08, "loss": 0.0015, "num_input_tokens_seen": 49416800, "step": 14850 }, { "epoch": 2.9722632118650427, "grad_norm": 0.014247686602175236, "learning_rate": 1.0153060369999523e-08, "loss": 0.0027, "num_input_tokens_seen": 49432800, "step": 14855 }, { "epoch": 2.9732636370457444, "grad_norm": 0.0005714791477657855, "learning_rate": 9.420281158839527e-09, "loss": 0.0019, "num_input_tokens_seen": 49449440, "step": 14860 }, { "epoch": 2.974264062226446, "grad_norm": 2.1527199745178223, "learning_rate": 8.714940098361601e-09, "loss": 0.0105, "num_input_tokens_seen": 49467072, "step": 14865 }, { "epoch": 2.975264487407148, "grad_norm": 2.3097634315490723, "learning_rate": 8.037037962987937e-09, "loss": 0.0097, "num_input_tokens_seen": 49482976, "step": 14870 }, { "epoch": 2.9762649125878498, "grad_norm": 0.021265488117933273, "learning_rate": 7.386575497014825e-09, "loss": 0.0065, "num_input_tokens_seen": 49498400, "step": 14875 }, { "epoch": 2.977265337768552, "grad_norm": 2.3299357891082764, "learning_rate": 6.76355341460988e-09, "loss": 0.0093, "num_input_tokens_seen": 49514880, "step": 14880 }, { "epoch": 2.9782657629492535, "grad_norm": 1.0414243936538696, "learning_rate": 6.167972399814815e-09, "loss": 0.0043, "num_input_tokens_seen": 49531072, "step": 14885 }, { "epoch": 2.979266188129955, "grad_norm": 0.4314385652542114, "learning_rate": 5.599833106539887e-09, "loss": 0.001, "num_input_tokens_seen": 49547904, "step": 14890 }, { "epoch": 2.980266613310657, "grad_norm": 0.07597378641366959, "learning_rate": 5.05913615856668e-09, "loss": 0.0085, "num_input_tokens_seen": 49565024, "step": 14895 }, { "epoch": 2.981267038491359, "grad_norm": 0.001904193079099059, "learning_rate": 4.545882149553649e-09, "loss": 0.0008, "num_input_tokens_seen": 49582720, "step": 14900 }, { "epoch": 2.9822674636720605, "grad_norm": 0.3590216040611267, "learning_rate": 4.060071643016694e-09, "loss": 0.0024, "num_input_tokens_seen": 49599680, "step": 14905 }, { "epoch": 2.9832678888527626, "grad_norm": 0.30538246035575867, "learning_rate": 3.601705172351366e-09, "loss": 0.0088, "num_input_tokens_seen": 49616352, "step": 14910 }, { "epoch": 2.9842683140334643, "grad_norm": 0.7103545069694519, "learning_rate": 3.1707832408134354e-09, "loss": 0.0015, "num_input_tokens_seen": 49632160, "step": 14915 }, { "epoch": 2.985268739214166, "grad_norm": 1.2359659671783447, "learning_rate": 2.7673063215272188e-09, "loss": 0.0045, "num_input_tokens_seen": 49649120, "step": 14920 }, { "epoch": 2.9862691643948676, "grad_norm": 0.8294517397880554, "learning_rate": 2.3912748574911326e-09, "loss": 0.0026, "num_input_tokens_seen": 49665216, "step": 14925 }, { "epoch": 2.9872695895755697, "grad_norm": 0.1594506800174713, "learning_rate": 2.042689261561037e-09, "loss": 0.0023, "num_input_tokens_seen": 49681824, "step": 14930 }, { "epoch": 2.9882700147562713, "grad_norm": 0.11420899629592896, "learning_rate": 1.7215499164668913e-09, "loss": 0.0028, "num_input_tokens_seen": 49698464, "step": 14935 }, { "epoch": 2.9892704399369734, "grad_norm": 0.029725858941674232, "learning_rate": 1.427857174796099e-09, "loss": 0.0049, "num_input_tokens_seen": 49714944, "step": 14940 }, { "epoch": 2.990270865117675, "grad_norm": 0.0007896269671618938, "learning_rate": 1.1616113590073863e-09, "loss": 0.0065, "num_input_tokens_seen": 49730816, "step": 14945 }, { "epoch": 2.9912712902983767, "grad_norm": 1.536004662513733, "learning_rate": 9.228127614252513e-10, "loss": 0.0089, "num_input_tokens_seen": 49745664, "step": 14950 }, { "epoch": 2.9922717154790783, "grad_norm": 1.1751419305801392, "learning_rate": 7.114616442288613e-10, "loss": 0.013, "num_input_tokens_seen": 49761792, "step": 14955 }, { "epoch": 2.9932721406597804, "grad_norm": 0.3211577832698822, "learning_rate": 5.275582394770328e-10, "loss": 0.0072, "num_input_tokens_seen": 49778240, "step": 14960 }, { "epoch": 2.994272565840482, "grad_norm": 0.012990407645702362, "learning_rate": 3.711027490804764e-10, "loss": 0.0008, "num_input_tokens_seen": 49795104, "step": 14965 }, { "epoch": 2.995272991021184, "grad_norm": 0.07811839133501053, "learning_rate": 2.4209534481844933e-10, "loss": 0.0095, "num_input_tokens_seen": 49812672, "step": 14970 }, { "epoch": 2.996273416201886, "grad_norm": 0.0054620616137981415, "learning_rate": 1.405361683359807e-10, "loss": 0.0044, "num_input_tokens_seen": 49828416, "step": 14975 }, { "epoch": 2.9972738413825875, "grad_norm": 0.0696403980255127, "learning_rate": 6.642533113276895e-11, "loss": 0.0021, "num_input_tokens_seen": 49844512, "step": 14980 }, { "epoch": 2.9982742665632895, "grad_norm": 0.018436448648571968, "learning_rate": 1.976291458538615e-11, "loss": 0.0004, "num_input_tokens_seen": 49861664, "step": 14985 }, { "epoch": 2.999274691743991, "grad_norm": 0.10342025756835938, "learning_rate": 5.489699195226905e-13, "loss": 0.0008, "num_input_tokens_seen": 49878144, "step": 14990 }, { "epoch": 2.9994747767801315, "num_input_tokens_seen": 49881152, "step": 14991, "total_flos": 2.4994081540447273e+18, "train_loss": 0.027549093207103522, "train_runtime": 856295.3813, "train_samples_per_second": 0.56, "train_steps_per_second": 0.018 } ], "logging_steps": 5, "max_steps": 14991, "num_input_tokens_seen": 49881152, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4994081540447273e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }